From 6e98e3f68e40769c8ba5a049a85b483eaac45a66 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:12:01 -0700 Subject: [PATCH 01/58] not buildable: remove host device from device_impl.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 38 +++++------------------------- sycl/source/detail/device_impl.hpp | 14 ----------- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 532cffe22500f..d043a59d9cebd 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -17,11 +17,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -device_impl::device_impl() - : MIsHostDevice(true), MPlatform(platform_impl::getHostPlatformImpl()), - // assert is natively supported by host - MIsAssertFailSupported(true) {} - device_impl::device_impl(pi_native_handle InteropDeviceHandle, const PluginPtr &Plugin) : device_impl(InteropDeviceHandle, nullptr, nullptr, Plugin) {} @@ -39,7 +34,7 @@ device_impl::device_impl(sycl::detail::pi::PiDevice Device, device_impl::device_impl(pi_native_handle InteropDeviceHandle, sycl::detail::pi::PiDevice Device, PlatformImplPtr Platform, const PluginPtr &Plugin) - : MDevice(Device), MIsHostDevice(false), + : MDevice(Device), MDeviceHostBaseTime(std::make_pair(0, 0)) { bool InteroperabilityConstructor = false; @@ -84,13 +79,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle, } device_impl::~device_impl() { - if (!MIsHostDevice) { - // TODO catch an exception and put it to list of asynchronous exceptions - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck(MDevice); - __SYCL_CHECK_OCL_CODE_NO_EXC(Err); - } + // TODO catch an exception and put it to list of asynchronous exceptions + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck(MDevice); + 
__SYCL_CHECK_OCL_CODE_NO_EXC(Err); } bool device_impl::is_affinity_supported( @@ -101,11 +94,6 @@ bool device_impl::is_affinity_supported( } cl_device_id device_impl::get() const { - if (MIsHostDevice) { - throw invalid_object_error( - "This instance of device doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_DEVICE); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MDevice); return pi::cast(getNative()); @@ -180,9 +168,6 @@ device_impl::get_backend_info() const { } bool device_impl::has_extension(const std::string &ExtensionName) const { - if (MIsHostDevice) - // TODO: implement extension management for host device; - return false; std::string AllExtensionNames = get_device_info_string(PiInfoCode::value); return (AllExtensionNames.find(ExtensionName) != std::string::npos); @@ -224,8 +209,6 @@ device_impl::create_sub_devices(const cl_device_partition_property *Properties, } std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_equally)) { throw sycl::feature_not_supported( "Device does not support " @@ -248,8 +231,6 @@ std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { std::vector device_impl::create_sub_devices(const std::vector &Counts) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_by_counts)) { throw sycl::feature_not_supported( "Device does not support " @@ -291,8 +272,6 @@ device_impl::create_sub_devices(const std::vector &Counts) const { std::vector device_impl::create_sub_devices( info::partition_affinity_domain AffinityDomain) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::partition_by_affinity_domain)) { throw sycl::feature_not_supported( @@ 
-319,8 +298,6 @@ std::vector device_impl::create_sub_devices( } std::vector device_impl::create_sub_devices() const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::ext_intel_partition_by_cslice)) { throw sycl::feature_not_supported( @@ -789,9 +766,6 @@ uint64_t device_impl::getCurrentDeviceTime() { uint64_t HostTime = duration_cast(steady_clock::now().time_since_epoch()) .count(); - if (MIsHostDevice) { - return HostTime; - } // To account for potential clock drift between host clock and device clock. // The value set is arbitrary: 200 seconds diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 981b1e059a30e..2526647152892 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -65,10 +65,6 @@ class device_impl { /// /// \return non-constant reference to PI device sycl::detail::pi::PiDevice &getHandleRef() { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - return MDevice; } @@ -78,18 +74,9 @@ class device_impl { /// /// \return constant reference to PI device const sycl::detail::pi::PiDevice &getHandleRef() const { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - return MDevice; } - /// Check if SYCL device is a host device - /// - /// \return true if SYCL device is a host device - bool is_host() const { return MIsHostDevice; } - /// Check if device is a CPU device /// /// \return true if SYCL device is a CPU device @@ -327,7 +314,6 @@ class device_impl { sycl::detail::pi::PiDevice MDevice = 0; sycl::detail::pi::PiDeviceType MType; sycl::detail::pi::PiDevice MRootDevice = nullptr; - bool MIsHostDevice; PlatformImplPtr MPlatform; bool MIsAssertFailSupported = false; mutable std::string MDeviceName; From abe4586ce16a07b69a1d2c662679697754db00a2 Mon Sep 17 
00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:13:51 -0700 Subject: [PATCH 02/58] not-buildable: remove getHostPlatformImpl Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.cpp | 3 --- sycl/source/detail/device_info.hpp | 4 ---- sycl/source/detail/platform_impl.cpp | 6 ------ sycl/source/detail/platform_impl.hpp | 8 -------- 4 files changed, 21 deletions(-) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 388c312305d4a..c2124456dae24 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -177,9 +177,6 @@ uint32_t context_impl::get_info() const { this->getPlugin()); } template <> platform context_impl::get_info() const { - if (is_host()) - return createSyclObjFromImpl( - platform_impl::getHostPlatformImpl()); return createSyclObjFromImpl(MPlatform); } template <> diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index a8769b69e83cc..61cb09e1b0b38 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -1802,10 +1802,6 @@ get_device_info_host() { return {}; } -template <> inline platform get_device_info_host() { - return createSyclObjFromImpl(platform_impl::getHostPlatformImpl()); -} - template <> inline std::string get_device_info_host() { return "SYCL host device"; } diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 2bdfab26676d9..9700fde466803 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -30,12 +30,6 @@ namespace detail { using PlatformImplPtr = std::shared_ptr; -PlatformImplPtr platform_impl::getHostPlatformImpl() { - static PlatformImplPtr HostImpl = std::make_shared(); - - return HostImpl; -} - PlatformImplPtr platform_impl::getOrMakePlatformImpl(sycl::detail::pi::PiPlatform PiPlatform, const PluginPtr &Plugin) { diff --git a/sycl/source/detail/platform_impl.hpp 
b/sycl/source/detail/platform_impl.hpp index 34537c7191af6..0bb8d1ab77e2f 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -192,14 +192,6 @@ class platform_impl { getOrMakeDeviceImpl(sycl::detail::pi::PiDevice PiDevice, const std::shared_ptr &PlatformImpl); - /// Static functions that help maintain platform uniquess and - /// equality of comparison - - /// Returns the host platform impl - /// - /// \return the host platform impl - static std::shared_ptr getHostPlatformImpl(); - /// Queries the cache to see if the specified PiPlatform has been seen /// before. If so, return the cached platform_impl, otherwise create a new /// one and cache it. From 6a0a25005b1b9b831419e94ed56b0bb8f15b4017 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:18:11 -0700 Subject: [PATCH 03/58] not buildable: remove get_device_info_host Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 3 - sycl/source/detail/device_info.hpp | 1032 ---------------------------- 2 files changed, 1035 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index d043a59d9cebd..2e87300425c20 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -105,9 +105,6 @@ platform device_impl::get_platform() const { template typename Param::return_type device_impl::get_info() const { - if (is_host()) { - return get_device_info_host(); - } return get_device_info( MPlatform->getOrMakeDeviceImpl(MDevice, MPlatform)); } diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 61cb09e1b0b38..9322b65128652 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -1272,1038 +1272,6 @@ typename Param::return_type get_device_info(const DeviceImplPtr &Dev) { return get_device_info_impl::get(Dev); } -// SYCL host device information - -// Default template is disabled, all possible 
instantiations are -// specified explicitly. -template -inline typename Param::return_type get_device_info_host() = delete; - -template <> -inline std::vector get_device_info_host() { - return std::vector(); -} - -template <> -inline ext::oneapi::experimental::architecture -get_device_info_host() { - return ext::oneapi::experimental::architecture::x86_64; -} - -template <> -inline info::device_type get_device_info_host() { - return info::device_type::host; -} - -template <> inline uint32_t get_device_info_host() { - return 0x8086; -} - -template <> -inline uint32_t get_device_info_host() { - return std::thread::hardware_concurrency(); -} - -template <> -inline uint32_t get_device_info_host() { - return 3; -} - -template <> -inline range<1> get_device_info_host>() { - // current value is the required minimum - return {1}; -} - -template <> -inline range<2> get_device_info_host>() { - // current value is the required minimum - return {1, 1}; -} - -template <> -inline range<3> get_device_info_host>() { - // current value is the required minimum - return {1, 1, 1}; -} - -template <> -inline constexpr size_t get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>() { - // See handler.hpp for the maximum value : - return static_cast((std::numeric_limits::max)()); -} - -template <> -inline id<1> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit}; -} - -template <> -inline id<2> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit}; -} - -template <> -inline id<3> get_device_info_host< - 
ext::oneapi::experimental::info::device::max_work_groups<3>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit, Limit}; -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline constexpr size_t -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<1> -get_device_info_host() { - - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<2> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<3> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<3>>(); -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate 
- return 0; -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Char); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Short); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Int); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Long); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Float); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Double); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Half); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getMaxClockFrequency(); -} - -template <> inline uint32_t get_device_info_host() { - return sizeof(void *) * 8; -} - -template <> -inline uint64_t get_device_info_host() { - return static_cast(OSUtil::getOSMemSize()); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - const uint64_t a = get_device_info_host() / 4; - const uint64_t b = 128ul * 1024 * 1024; - return (a > b) ? 
a : b; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> inline bool get_device_info_host() { - return false; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel, memory_order::seq_cst}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline bool -get_device_info_host() { - return false; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 128; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height. 
Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width. Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. 
- return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/width, which are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. 
- return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // Not supported in SYCL - return 0; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 2048; -} - -template <> inline uint32_t get_device_info_host() { - // current value is the required minimum - return 16; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024; -} - -template <> -inline uint32_t get_device_info_host() { - return 1024; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::round_to_nearest, info::fp_config::inf_nan}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::fma, info::fp_config::round_to_nearest, - info::fp_config::round_to_zero, info::fp_config::round_to_inf, - info::fp_config::inf_nan, info::fp_config::denorm}; -} - -template <> -inline info::global_mem_cache_type -get_device_info_host() { - return info::global_mem_cache_type::read_write; -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getMemCacheLineSize(); -} - -template <> -inline uint64_t get_device_info_host() { - return PlatformUtil::getMemCacheSize(); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 64 * 1024; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline info::local_mem_type -get_device_info_host() { - return info::local_mem_type::global; -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 32 * 1024; -} - -template <> -inline bool 
get_device_info_host() { - return false; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline size_t get_device_info_host() { - typedef std::ratio_divide - ns_period; - return ns_period::num / ns_period::den; -} - -template <> inline bool get_device_info_host() { - union { - uint16_t a; - uint8_t b[2]; - } u = {0x0100}; - - return u.b[1]; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {info::execution_capability::exec_kernel}; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> inline std::string get_device_info_host() { - return "SYCL host device"; -} - -template <> inline std::string get_device_info_host() { - return ""; -} - -template <> -inline std::string get_device_info_host() { - return "1.2"; -} - -template <> inline std::string get_device_info_host() { - return "FULL PROFILE"; -} - -template <> inline std::string get_device_info_host() { - return "1.2"; -} - -template <> -inline std::string get_device_info_host() { - return "not applicable"; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update when appropriate - return {}; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024 * 1024; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> inline device get_device_info_host() { - throw invalid_object_error( - "Partitioning to subdevices of the host device is not implemented", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO 
update once subdevice creation is enabled - return 1; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline info::partition_property -get_device_info_host() { - return info::partition_property::no_partition; -} - -template <> -inline info::partition_affinity_domain -get_device_info_host() { - // TODO update once subdevice creation is enabled - return info::partition_affinity_domain::not_applicable; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subdevice creation is enabled - return 1; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Backend version feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool -get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template 
<> -inline bool get_device_info_host() { - return false; -} - -// Specializations for intel extensions for Level Zero low-level -// detail device descriptors (not support on host). -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the device ID is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline std::string -get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -// TODO: Remove with 
deprecated feature -// device::get_info() -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO:Move to namespace ext::intel::info::device -template <> 
inline bool get_device_info_host() { - return false; -} - -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint64_t get_device_info_host() { - throw runtime_error( - "Obtaining the device free memory is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory clock rate is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory bus width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline int32_t -get_device_info_host() { - throw runtime_error( - "Obtaining max compute queue indices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host< - ext::codeplay::experimental::info::device::supports_fusion>() { - // No support for fusion on the host device. 
- return false; -} - -template <> -inline uint32_t get_device_info_host< - ext::codeplay::experimental::info::device::max_registers_per_work_group>() { - throw runtime_error("Obtaining the maximum number of available registers per " - "work-group is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::image_row_pitch_align>() { - throw runtime_error("Obtaining image pitch alignment is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_row_pitch>() { - throw runtime_error("Obtaining max image linear pitch is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::matrix_combinations>() { - throw runtime_error("Obtaining matrix combinations is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_width>() { - throw runtime_error("Obtaining max image linear width is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_height>() { - throw runtime_error("Obtaining max image linear height is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline float get_device_info_host< - ext::oneapi::experimental::info::device::mipmap_max_anisotropy>() { - throw runtime_error("Bindless image mipaps are not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector get_device_info_host< - ext::oneapi::experimental::info::device::component_devices>() { - throw runtime_error("Host devices cannot be component devices.", - 
PI_ERROR_INVALID_DEVICE); -} - -template <> -inline sycl::device get_device_info_host< - ext::oneapi::experimental::info::device::composite_device>() { - throw runtime_error("Host devices cannot be composite devices.", - PI_ERROR_INVALID_DEVICE); -} - -// Returns the list of all progress guarantees that can be requested for -// work_groups from the coordination level of root_group when using host device. -// First it calls getHostProgressGuarantee to get the strongest guarantee -// available and then calls getProgressGuaranteesUpTo to get a list of all -// guarantees that are either equal to the strongest guarantee or weaker than -// it. The next 5 definitions follow the same model but for different scopes. -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - 
device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::sub_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::sub_group)); -} - // Returns the list of all progress guarantees that can be requested for // work_groups from the coordination level of root_group when using the device // given by Dev. 
First it calls getProgressGuarantee to get the strongest From 35b682216afe064e98bf8c6f2c45334d99a5120a Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:23:01 -0700 Subject: [PATCH 04/58] not-buildable: remove is_host from context_impl.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.cpp | 19 +++++-------------- sycl/source/detail/context_impl.hpp | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index c2124456dae24..87663c4e10775 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -34,7 +34,6 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), MPropList(PropList), - MHostContext(detail::getSyclObjImpl(Device)->is_host()), MSupportBufferLocationByDevices(NotChecked) { MKernelProgramCache.setContextPtr(this); } @@ -43,7 +42,7 @@ context_impl::context_impl(const std::vector Devices, async_handler AsyncHandler, const property_list &PropList) : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(Devices), - MContext(nullptr), MPlatform(), MPropList(PropList), MHostContext(false), + MContext(nullptr), MPlatform(), MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); std::vector DeviceIds; @@ -88,7 +87,7 @@ context_impl::context_impl(sycl::detail::pi::PiContext PiContext, bool OwnedByRuntime) : MOwnedByRuntime(OwnedByRuntime), MAsyncHandler(AsyncHandler), MDevices(DeviceList), MContext(PiContext), MPlatform(), - MHostContext(false), MSupportBufferLocationByDevices(NotChecked) { + MSupportBufferLocationByDevices(NotChecked) { if (!MDevices.empty()) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); } else { @@ -132,18 +131,11 @@ context_impl::context_impl(sycl::detail::pi::PiContext 
PiContext, } cl_context context_impl::get() const { - if (MHostContext) { - throw invalid_object_error( - "This instance of context doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_CONTEXT); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MContext); return pi::cast(MContext); } -bool context_impl::is_host() const { return MHostContext; } - context_impl::~context_impl() { // Free all events associated with the initialization of device globals. for (auto &DeviceGlobalInitializer : MDeviceGlobalInitializers) @@ -159,10 +151,9 @@ context_impl::~context_impl() { assert(LibProg.second && "Null program must not be kept in the cache"); getPlugin()->call(LibProg.second); } - if (!MHostContext) { - // TODO catch an exception and put it to list of asynchronous exceptions - getPlugin()->call_nocheck(MContext); - } + + // TODO catch an exception and put it to list of asynchronous exceptions + getPlugin()->call_nocheck(MContext); } const async_handler &context_impl::get_async_handler() const { diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index a1e383f721e31..af20236fc4b23 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -272,7 +272,6 @@ class context_impl { sycl::detail::pi::PiContext MContext; PlatformImplPtr MPlatform; property_list MPropList; - bool MHostContext; CachedLibProgramsT MCachedLibPrograms; std::mutex MCachedLibProgramsMutex; mutable KernelProgramCache MKernelProgramCache; From 77c749c6ea54b35b5324bfe163460279b3039930 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:29:12 -0700 Subject: [PATCH 05/58] not-buildable: remove is_host from event_impl.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 91 +++++++++++++------------------ sycl/source/detail/event_impl.hpp | 3 +- 2 files changed, 38 insertions(+), 56 deletions(-) diff --git 
a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 7442cd4ccfe7a..e187be3563f5b 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -37,20 +37,9 @@ void event_impl::ensureContextInitialized() { if (MIsContextInitialized) return; - if (MHostEvent) { - QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue(); - this->setContextImpl(detail::getSyclObjImpl(HostQueue->get_context())); - } else { - const device SyclDevice; - this->setContextImpl(detail::queue_impl::getDefaultOrNew( - detail::getSyclObjImpl(SyclDevice))); - } -} - -bool event_impl::is_host() { - // Treat all devices that don't support interoperability as host devices to - // avoid attempts to call method get on such events. - return MHostEvent; + const device SyclDevice; + this->setContextImpl(detail::queue_impl::getDefaultOrNew( + detail::getSyclObjImpl(SyclDevice))); } event_impl::~event_impl() { @@ -59,7 +48,7 @@ event_impl::~event_impl() { } void event_impl::waitInternal(bool *Success) { - if (!MHostEvent && MEvent) { + if (MEvent) { // Wait for the native event sycl::detail::pi::PiResult Err = getPlugin()->call_nocheck(1, &MEvent); @@ -92,7 +81,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (MHostEvent || !MEvent) { + if (!MEvent) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -137,7 +126,6 @@ const PluginPtr &event_impl::getPlugin() { void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { - MHostEvent = Context->is_host(); MContext = Context; MIsContextInitialized = true; } @@ -145,7 +133,7 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), MHostEvent(false), + 
MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { if (MContext->is_host()) { @@ -317,7 +305,7 @@ event_impl::get_profiling_info() { // made by forcing the re-sync of submit time to start time is less than // 0.5ms. These timing values were obtained empirically using an integrated // Intel GPU). - if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) { + if (MEventFromSubmittedExecCommandBuffer && MEvent) { uint64_t StartTime = get_event_profiling_info( this->getHandleRef(), this->getPlugin()); @@ -336,20 +324,19 @@ event_impl::get_profiling_info() { if (isNOP() && MSubmitTime) return MSubmitTime; - if (!MHostEvent) { - if (MEvent) { - auto StartTime = - get_event_profiling_info( + if (MEvent) { + auto StartTime = + get_event_profiling_info( + this->getHandleRef(), this->getPlugin()); + if (!MFallbackProfiling) { + return StartTime; + } else { + auto DeviceBaseTime = + get_event_profiling_info( this->getHandleRef(), this->getPlugin()); - if (!MFallbackProfiling) { - return StartTime; - } else { - auto DeviceBaseTime = - get_event_profiling_info( - this->getHandleRef(), this->getPlugin()); - return MHostBaseTime - DeviceBaseTime + StartTime; - } + return MHostBaseTime - DeviceBaseTime + StartTime; } + return 0; } if (!MHostProfilingInfo) @@ -368,19 +355,17 @@ uint64_t event_impl::get_profiling_info() { if (isNOP() && MSubmitTime) return MSubmitTime; - if (!MHostEvent) { - if (MEvent) { - auto EndTime = - get_event_profiling_info( + if (MEvent) { + auto EndTime = + get_event_profiling_info( + this->getHandleRef(), this->getPlugin()); + if (!MFallbackProfiling) { + return EndTime; + } else { + auto DeviceBaseTime = + get_event_profiling_info( this->getHandleRef(), this->getPlugin()); - if (!MFallbackProfiling) { - return EndTime; - } else { - auto DeviceBaseTime = - get_event_profiling_info( - this->getHandleRef(), this->getPlugin()); - return MHostBaseTime - DeviceBaseTime + EndTime; - } + return MHostBaseTime - 
DeviceBaseTime + EndTime; } return 0; } @@ -393,7 +378,7 @@ uint64_t event_impl::get_profiling_info() { } template <> uint32_t event_impl::get_info() { - if (!MHostEvent && MEvent) { + if (MEvent) { return get_event_info(this->getHandleRef(), this->getPlugin()); } @@ -406,17 +391,15 @@ event_impl::get_info() { if (MState == HES_Discarded) return info::event_command_status::ext_oneapi_unknown; - if (!MHostEvent) { - // Command is enqueued and PiEvent is ready - if (MEvent) - return get_event_info( - this->getHandleRef(), this->getPlugin()); - // Command is blocked and not enqueued, PiEvent is not assigned yet - else if (MCommand) - return sycl::info::event_command_status::submitted; - } + // Command is enqueued and PiEvent is ready + if (MEvent) + return get_event_info( + this->getHandleRef(), this->getPlugin()); + // Command is blocked and not enqueued, PiEvent is not assigned yet + else if (MCommand) + return sycl::info::event_command_status::submitted; - return MHostEvent && MState.load() != HES_Complete + return MState.load() != HES_Complete ? sycl::info::event_command_status::submitted : info::event_command_status::complete; } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f33c160f9df97..08bb15cff6ff8 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,7 +49,7 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MHostEvent(State), MIsFlushed(true), + : MIsInitialized(false), MIsFlushed(true), MState(State.value_or(HES_Complete)) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. 
Deferring may lead to conficts with noexcept @@ -364,7 +364,6 @@ class event_impl { uint64_t MSubmitTime = 0; uint64_t MHostBaseTime = 0; ContextImplPtr MContext; - bool MHostEvent = true; std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; std::weak_ptr MQueue; From 6e7142097db4e014c7a12e576c2af6d124675ed1 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:31:22 -0700 Subject: [PATCH 06/58] not-buildable: update is_host for API objects to be easily removed Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 5 ++--- sycl/source/device.cpp | 5 ++--- sycl/source/event.cpp | 5 ++--- sycl/source/kernel.cpp | 5 ++--- sycl/source/platform.cpp | 6 ++---- sycl/source/queue.cpp | 5 ++--- 6 files changed, 12 insertions(+), 19 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 3273c4f3056c2..c24a6c1ec2079 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -138,9 +138,8 @@ context::get_backend_info() const { cl_context context::get() const { return impl->get(); } bool context::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "context::is_host should not be called in implementation."); - return IsHost; + assert(true && "context::is_host should not be called in implementation."); + return false; } backend context::get_backend() const noexcept { return impl->getBackend(); } diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 70aa37aad26a2..a3a88ebf6636a 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -71,9 +71,8 @@ std::vector device::get_devices(info::device_type deviceType) { cl_device_id device::get() const { return impl->get(); } bool device::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "device::is_host should not be called in implementation."); - return IsHost; + assert(true && "device::is_host should not be called in implementation."); + return false; } bool device::is_cpu() const { return 
impl->is_cpu(); } diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index a7bae8055c10b..12b4a7e68164e 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -38,9 +38,8 @@ bool event::operator==(const event &rhs) const { return rhs.impl == impl; } bool event::operator!=(const event &rhs) const { return !(*this == rhs); } bool event::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "event::is_host should not be called in implementation."); - return IsHost; + assert(true && "event::is_host should not be called in implementation."); + return false; } void event::wait() { impl->wait(impl); } diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index ff14c0a879078..bc842f6e596a5 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -31,9 +31,8 @@ kernel::kernel(cl_kernel ClKernel, const context &SyclContext) cl_kernel kernel::get() const { return impl->get(); } bool kernel::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "kernel::is_host should not be called in implementation."); - return IsHost; + assert(true && "kernel::is_host should not be called in implementation."); + return false; } context kernel::get_context() const { diff --git a/sycl/source/platform.cpp b/sycl/source/platform.cpp index a2ee714952be9..9a15943213ec6 100644 --- a/sycl/source/platform.cpp +++ b/sycl/source/platform.cpp @@ -41,10 +41,8 @@ bool platform::has_extension(const std::string &ExtensionName) const { } bool platform::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && - "platform::is_host should not be called in implementation."); - return IsHost; + assert(true && "platform::is_host should not be called in implementation."); + return false; } std::vector platform::get_devices(info::device_type DeviceType) const { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 15d7f11fcb42d..6a66cce267aa1 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -96,9 +96,8 @@ 
queue::ext_oneapi_get_graph() const { } bool queue::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "queue::is_host should not be called in implementation."); - return IsHost; + assert(true && "queue::is_host should not be called in implementation."); + return false; } void queue::throw_asynchronous() { impl->throw_asynchronous(); } From 7e5abe966b8ebbfee9e0adcc7ce935cd864c21b8 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 08:53:47 -0700 Subject: [PATCH 07/58] not-buildable: update most obvious places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 37 ++++-------- sycl/source/detail/event_impl.cpp | 27 +++------ sycl/source/detail/event_impl.hpp | 13 ++-- sycl/source/detail/scheduler/commands.cpp | 60 +++---------------- sycl/source/detail/scheduler/commands.hpp | 7 +-- .../source/detail/scheduler/graph_builder.cpp | 4 +- sycl/source/detail/scheduler/scheduler.cpp | 24 +------- sycl/source/detail/scheduler/scheduler.hpp | 8 --- sycl/source/handler.cpp | 9 +-- 9 files changed, 39 insertions(+), 150 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index c24a6c1ec2079..70b12836fc297 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -56,31 +56,20 @@ context::context(const std::vector &DeviceList, throw invalid_parameter_error("DeviceList is empty.", PI_ERROR_INVALID_VALUE); } - auto NonHostDeviceIter = std::find_if_not( - DeviceList.begin(), DeviceList.end(), [&](const device &CurrentDevice) { - return detail::getSyclObjImpl(CurrentDevice)->is_host(); - }); - if (NonHostDeviceIter == DeviceList.end()) - impl = std::make_shared(DeviceList[0], AsyncHandler, + + const auto &RefPlatform = + detail::getSyclObjImpl(DeviceList[0].get_platform())->getHandleRef(); + if (std::any_of(DeviceList.begin(), DeviceList.end(), + [&](const device &CurrentDevice) { + return (detail::getSyclObjImpl(CurrentDevice.get_platform()) + ->getHandleRef() != RefPlatform); + })) + 
throw invalid_parameter_error( + "Can't add devices across platforms to a single context.", + PI_ERROR_INVALID_DEVICE); + else + impl = std::make_shared(DeviceList, AsyncHandler, PropList); - else { - const device &NonHostDevice = *NonHostDeviceIter; - const auto &NonHostPlatform = - detail::getSyclObjImpl(NonHostDevice.get_platform())->getHandleRef(); - if (std::any_of(DeviceList.begin(), DeviceList.end(), - [&](const device &CurrentDevice) { - return ( - detail::getSyclObjImpl(CurrentDevice)->is_host() || - (detail::getSyclObjImpl(CurrentDevice.get_platform()) - ->getHandleRef() != NonHostPlatform)); - })) - throw invalid_parameter_error( - "Can't add devices across platforms to a single context.", - PI_ERROR_INVALID_DEVICE); - else - impl = std::make_shared(DeviceList, AsyncHandler, - PropList); - } } context::context(cl_context ClContext, async_handler AsyncHandler) { const auto &Plugin = sycl::detail::pi::getPlugin(); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e187be3563f5b..28bb37200392a 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -136,13 +136,6 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { - if (MContext->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "The syclContext must match the OpenCL context " - "associated with the clEvent. 
" + - codeToString(PI_ERROR_INVALID_CONTEXT)); - } - sycl::detail::pi::PiContext TempContext; getPlugin()->call( MEvent, PI_EVENT_INFO_CONTEXT, sizeof(sycl::detail::pi::PiContext), @@ -162,19 +155,8 @@ event_impl::event_impl(const QueueImplPtr &Queue) { void event_impl::associateWithQueue(const QueueImplPtr &Queue) { MQueue = Queue; - MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled; + MIsProfilingEnabled = Queue->MIsProfilingEnabled; MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); - if (Queue->is_host()) { - MState.store(HES_NotComplete); - if (Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); - } - return; - } MState.store(HES_Complete); } @@ -578,6 +560,13 @@ bool event_impl::isCompleted() { info::event_command_status::complete; } +void event_impl::setCommand(void *Cmd) { + MCommand = Cmd; + auto TypedCommand = static_cast(Cmd); + if (TypedCommand) + MIsHostTask = TypedCommand->isHostTask(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 08bb15cff6ff8..7c1eb99e3b286 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -68,14 +68,6 @@ class event_impl { event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext); event_impl(const QueueImplPtr &Queue); - /// Checks if this event is a SYCL host event. - /// - /// All devices that do not support OpenCL interoperability are treated as - /// host device to avoid attempts to call method get on such events. - // - /// \return true if this event is a SYCL host event. - bool is_host(); - /// Waits for the event. /// /// Self is needed in order to pass shared_ptr to Scheduler. 
@@ -177,7 +169,7 @@ class event_impl { /// Scheduler mutex must be locked in write mode when this is called. /// /// @param Command is a generic pointer to Command object instance. - void setCommand(void *Command) { MCommand = Command; } + void setCommand(void *Command); /// Returns host profiling information. /// @@ -345,6 +337,8 @@ class event_impl { void setEnqueued() { MIsEnqueued = true; } + bool isHost() { return MIsHostTask; } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -412,6 +406,7 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; + bool MIsHostTask{false}; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index bf7e44062cb5e..0739ac77373b7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -96,9 +96,7 @@ static std::string demangleKernelName(std::string Name) { return Name; } #endif static std::string deviceToString(device Device) { - if (getSyclObjImpl(Device)->is_host()) - return "HOST"; - else if (Device.is_cpu()) + if (Device.is_cpu()) return "CPU"; else if (Device.is_gpu()) return "GPU"; @@ -144,10 +142,7 @@ void applyFuncOnFilteredArgs( #ifdef XPTI_ENABLE_INSTRUMENTATION static size_t deviceToID(const device &Device) { - if (getSyclObjImpl(Device)->is_host()) - return 0; - else - return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); + return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } #endif @@ -265,7 +260,7 @@ std::vector Command::getPiEventsBlocking( // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. // Skip host task and NOP events also. 
- if (!EventImpl->isContextInitialized() || EventImpl->is_host() || + if (!EventImpl->isContextInitialized() || EventImpl->isHost() || EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been @@ -455,40 +450,9 @@ void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { if (!EventImpls.empty()) { - if (Queue->is_host()) { - // Host queue can wait for events from different contexts, i.e. it may - // contain events with different contexts in its MPreparedDepsEvents. - // OpenCL 2.1 spec says that clWaitForEvents will return - // CL_INVALID_CONTEXT if events specified in the list do not belong to - // the same context. Thus we split all the events into per-context map. - // An example. We have two queues for the same CPU device: Q1, Q2. Thus - // we will have two different contexts for the same CPU device: C1, C2. - // Also we have default host queue. This queue is accessible via - // Scheduler. Now, let's assume we have three different events: E1(C1), - // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all - // three events (E1, E2, E3). Now, if piEventsWait is called for all - // three events we'll experience failure with CL_INVALID_CONTEXT 'cause - // these events refer to different contexts. 
- std::map> - RequiredEventsPerContext; - - for (const EventImplPtr &Event : EventImpls) { - ContextImplPtr Context = Event->getContextImpl(); - assert(Context.get() && - "Only non-host events are expected to be waited for here"); - RequiredEventsPerContext[Context.get()].push_back(Event); - } - - for (auto &CtxWithEvents : RequiredEventsPerContext) { - std::vector RawEvents = - getPiEvents(CtxWithEvents.second); - CtxWithEvents.first->getPlugin()->call( - RawEvents.size(), RawEvents.data()); - } - } else { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) - assert(Event->getContextImpl().get() && + assert(!Event->isHost() && "Only non-host events are expected to be waited for here"); #endif @@ -501,7 +465,6 @@ void Command::waitForEvents(QueueImplPtr Queue, MEvent->setHostEnqueueTime(); Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); - } } } @@ -714,7 +677,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // enqueued // (e.g. alloca). Note that we can't check the pi event to make that // distinction since the command might still be unenqueued at this point. 
- bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()); + bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -885,7 +848,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, else { MEvent->setEnqueued(); if (MShouldCompleteEventIfPossible && - (MEvent->is_host() || MEvent->getHandleRef() == nullptr)) + (MEvent->isHost() || MEvent->getHandleRef() == nullptr)) MEvent->setComplete(); // Consider the command is successfully enqueued if return code is @@ -3172,8 +3135,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = getPiEventsBlocking(Events); - if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) { - // NOP for host device. + if (PiEvents.empty()) { // If Events is empty, then the barrier has no effect. return PI_SUCCESS; } @@ -3244,10 +3206,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreWait: { CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); Plugin->call( @@ -3258,10 +3216,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreSignal: { CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. 
- return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); Plugin->call( diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8ba0cceee9e6a..89cabd134a7e1 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -377,10 +377,9 @@ class Command { std::string MSubmissionFileName; std::string MSubmissionFunctionName; - // This flag allows to control whether host event should be set complete - // after successfull enqueue of command. Event is considered as host event if - // either it's is_host() return true or there is no backend representation - // of event (i.e. getHandleRef() return reference to nullptr value). + // This flag allows to control whether event should be set complete + // after successfull enqueue of command. Event is considered as "host" event if + // there is no backend representation of event (i.e. getHandleRef() return reference to nullptr value). // By default the flag is set to true due to most of host operations are // synchronous. The only asynchronous operation currently is host-task. bool MShouldCompleteEventIfPossible = true; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index f0c5dc670aa05..196232b95d734 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -50,9 +50,7 @@ static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { } static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { - // Consider two different host contexts to be the same to avoid additional - // allocation on the host - return LHS == RHS || (LHS->is_host() && RHS->is_host()); + return LHS == RHS; } /// Checks if current requirement is requirement for sub buffer. 
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 7b6c837131658..0b061a86dbc62 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -105,14 +105,6 @@ EventImplPtr Scheduler::addCG( auto *CGExecKernelPtr = static_cast(CommandGroup.get()); Streams = CGExecKernelPtr->getStreams(); CGExecKernelPtr->clearStreams(); - // Stream's flush buffer memory is mainly initialized in stream's __init - // method. However, this method is not available on host device. - // Initializing stream's flush buffer on the host side in a separate task. - if (Queue->is_host()) { - for (const StreamImplPtr &Stream : Streams) { - Stream->initStreamHost(Queue); - } - } } std::vector> AuxiliaryResources; AuxiliaryResources = CommandGroup->getAuxiliaryResources(); @@ -394,18 +386,6 @@ void Scheduler::enqueueUnblockedCommands( } } -Scheduler::Scheduler() { - sycl::device HostDevice = - createSyclObjFromImpl(device_impl::getHostDeviceImpl()); - sycl::context HostContext{HostDevice}; - DefaultHostQueue = QueueImplPtr( - new queue_impl(detail::getSyclObjImpl(HostDevice), - detail::getSyclObjImpl(HostContext), /*AsyncHandler=*/{}, - /*PropList=*/{sycl::property::queue::enable_profiling()})); -} - -Scheduler::~Scheduler() { DefaultHostQueue.reset(); } - void Scheduler::releaseResources(BlockingT Blocking) { // There might be some commands scheduled for post enqueue cleanup that // haven't been freed because of the graph mutex being locked at the time, @@ -726,11 +706,11 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. 
if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || + !SyclEventImplPtr->isHost()) || SyclEventImplPtr->isNOP()) { return true; } - if (SyclEventImplPtr->is_host()) { + if (SyclEventImplPtr->isHost()) { return SyclEventImplPtr->isCompleted(); } // Cross-context dependencies can't be passed to the backend directly. diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09437928f1d32..6fa95cb4a4a54 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -450,10 +450,6 @@ class Scheduler { /// \return true if an instance of the scheduler object exists. static bool isInstanceAlive(); - QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; } - - const QueueImplPtr &getDefaultHostQueue() const { return DefaultHostQueue; } - static MemObjRecord *getMemObjRecord(const Requirement *const Req); void deferMemObjRelease(const std::shared_ptr &MemObj); @@ -468,8 +464,6 @@ class Scheduler { bool isInFusionMode(QueueIdT Queue); - Scheduler(); - ~Scheduler(); void releaseResources(BlockingT Blocking = BlockingT::BLOCKING); bool isDeferredMemObjectsEmpty(); @@ -966,8 +960,6 @@ class Scheduler { MAuxiliaryResources; std::mutex MAuxiliaryResourcesMutex; - QueueImplPtr DefaultHostQueue; - friend class Command; friend class DispatchHostTask; friend class queue_impl; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 8223c9330814e..749ab6750df5e 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -273,12 +273,6 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif - if (MQueue->is_host()) { - MHostKernel->call(MNDRDesc, (NewEvent) - ? 
NewEvent->getHostProfilingInfo() - : nullptr); - Result = PI_SUCCESS; - } else { if (MQueue->getDeviceImplPtr()->getBackend() == backend::ext_intel_esimd_emulator) { // Capture the host timestamp for profiling (queue time) @@ -313,7 +307,6 @@ event handler::finalize() { MKernelName.c_str(), RawEvents, NewEvent, nullptr, MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative); } - } #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -351,7 +344,7 @@ event handler::finalize() { if (PI_SUCCESS != EnqueueKernel()) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); - else if (NewEvent->is_host() || NewEvent->getHandleRef() == nullptr) + else if (NewEvent->isHost() || NewEvent->getHandleRef() == nullptr) NewEvent->setComplete(); NewEvent->setEnqueued(); From 31a702c1c2ec81aa2430595230761edc75d52dce Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 23 May 2024 06:33:00 -0700 Subject: [PATCH 08/58] not-buildable: remove is_host from obvious places, part2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/backend_impl.hpp | 1 - sycl/source/detail/bindless_images.cpp | 4 - sycl/source/detail/context_impl.cpp | 10 --- sycl/source/detail/device_impl.hpp | 6 +- sycl/source/detail/filter_selector_impl.cpp | 3 - sycl/source/detail/helpers.cpp | 4 +- sycl/source/detail/kernel_impl.cpp | 4 +- sycl/source/detail/kernel_impl.hpp | 22 ------ sycl/source/detail/kernel_info.hpp | 73 ------------------- sycl/source/detail/platform_impl.cpp | 17 +---- sycl/source/detail/platform_impl.hpp | 12 --- sycl/source/detail/platform_info.hpp | 30 -------- sycl/source/detail/program_impl.cpp | 46 +++--------- sycl/source/detail/program_impl.hpp | 6 -- sycl/source/detail/queue_impl.cpp | 35 ++++----- sycl/source/detail/queue_impl.hpp | 61 +++++----------- sycl/source/detail/scheduler/commands.cpp | 20 +---- .../source/detail/scheduler/graph_builder.cpp | 2 +- 18 files changed, 56 insertions(+), 
300 deletions(-) diff --git a/sycl/source/detail/backend_impl.hpp b/sycl/source/detail/backend_impl.hpp index ca23ceb48815c..0c160ed1920c4 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -15,7 +15,6 @@ inline namespace _V1 { namespace detail { template backend getImplBackend(const T &Impl) { - assert(!Impl->is_host() && "Cannot get the backend for host."); return Impl->getContextImplPtr()->getBackend(); } diff --git a/sycl/source/detail/bindless_images.cpp b/sycl/source/detail/bindless_images.cpp index 174fe087ede4f..fbf90e692598e 100644 --- a/sycl/source/detail/bindless_images.cpp +++ b/sycl/source/detail/bindless_images.cpp @@ -746,10 +746,6 @@ __SYCL_EXPORT void *pitched_alloc_device(size_t *resultPitch, std::shared_ptr CtxImpl = sycl::detail::getSyclObjImpl(syclContext); - if (CtxImpl->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::memory_allocation), - "Cannot allocate pitched memory on host!"); - } pi_context PiContext = CtxImpl->getHandleRef(); const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin(); diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 87663c4e10775..0c79ed2f70462 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -162,8 +162,6 @@ const async_handler &context_impl::get_async_handler() const { template <> uint32_t context_impl::get_info() const { - if (is_host()) - return 0; return get_context_info(this->getHandleRef(), this->getPlugin()); } @@ -183,8 +181,6 @@ context_impl::get_info() sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_order, info::device::atomic_memory_order_capabilities>( @@ -200,8 +196,6 @@ context_impl::get_info() sycl::memory_scope::work_item, sycl::memory_scope::sub_group, 
sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_scope, info::device::atomic_memory_scope_capabilities>( @@ -216,8 +210,6 @@ context_impl::get_info() const { sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( @@ -232,8 +224,6 @@ context_impl::get_info() const { sycl::memory_scope::work_item, sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 2526647152892..efec017d372f5 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -80,18 +80,18 @@ class device_impl { /// Check if device is a CPU device /// /// \return true if SYCL device is a CPU device - bool is_cpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_CPU)); } + bool is_cpu() const { return MType == PI_DEVICE_TYPE_CPU; } /// Check if device is a GPU device /// /// \return true if SYCL device is a GPU device - bool is_gpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_GPU)); } + bool is_gpu() const { return MType == PI_DEVICE_TYPE_GPU; } /// Check if device is an accelerator device /// /// \return true if SYCL device is an accelerator device bool is_accelerator() const { - return (!is_host() && (MType == PI_DEVICE_TYPE_ACC)); + return MType == PI_DEVICE_TYPE_ACC; } /// Return device type diff --git a/sycl/source/detail/filter_selector_impl.cpp b/sycl/source/detail/filter_selector_impl.cpp index 4b5f8e836ee6d..0043622d62483 100644 --- a/sycl/source/detail/filter_selector_impl.cpp +++ b/sycl/source/detail/filter_selector_impl.cpp @@ 
-99,9 +99,6 @@ filter_selector_impl::filter_selector_impl(const std::string &Input) } int filter_selector_impl::operator()(const device &Dev) const { - assert(!sycl::detail::getSyclObjImpl(Dev)->is_host() && - "filter_selector_impl should not be used with host."); - int Score = REJECT_DEVICE_SCORE; for (auto &Filter : mFilters) { diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 1bdb2ddbd4697..75c6fd72b8fd0 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -32,7 +32,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || + !SyclEventImplPtr->isHost()) || SyclEventImplPtr->isNOP()) { continue; } @@ -41,7 +41,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { bool NoPiEvent = SyclEventImplPtr->MCommand && !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); - if (SyclEventImplPtr->is_host() || + if (SyclEventImplPtr->isHost() || SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { // Call wait, because the command for the event might not have been // enqueued when kernel fusion is happening. 
diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index 9c5a1851cd3b1..b4ab6b232eef9 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -76,9 +76,7 @@ kernel_impl::kernel_impl(ContextImplPtr Context, ProgramImplPtr ProgramImpl) kernel_impl::~kernel_impl() { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host()) { - getPlugin()->call(MKernel); - } + getPlugin()->call(MKernel); } bool kernel_impl::isCreatedFromSource() const { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 1e56e6da4dc53..1a1542d0d409b 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -103,20 +103,10 @@ class kernel_impl { /// /// \return a valid cl_kernel instance cl_kernel get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of kernel doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_KERNEL); - } getPlugin()->call(MKernel); return pi::cast(MKernel); } - /// Check if the associated SYCL context is a SYCL host context. - /// - /// \return true if this SYCL kernel is a host kernel. 
- bool is_host() const { return MContext->is_host(); } - const PluginPtr &getPlugin() const { return MContext->getPlugin(); } /// Query information from the kernel object using the info::kernel_info @@ -217,11 +207,6 @@ template inline typename Param::return_type kernel_impl::get_info() const { static_assert(is_kernel_info_desc::value, "Invalid kernel information descriptor"); - if (is_host()) { - // TODO implement - assert(0 && "Not implemented"); - } - if constexpr (std::is_same_v) checkIfValidForNumArgsInfoQuery(); @@ -248,9 +233,6 @@ kernel_impl::get_info(const device &Device) const { "is a built-in kernel."); } - if (is_host()) { - return get_kernel_device_specific_info_host(Device); - } return get_kernel_device_specific_info( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), getPlugin()); @@ -260,10 +242,6 @@ template inline typename Param::return_type kernel_impl::get_info(const device &Device, const sycl::range<3> &WGSize) const { - if (is_host()) { - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); - } return get_kernel_device_specific_info_with_input( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), WGSize, getPlugin()); diff --git a/sycl/source/detail/kernel_info.hpp b/sycl/source/detail/kernel_info.hpp index 12256158eed49..79c0f73c952de 100644 --- a/sycl/source/detail/kernel_info.hpp +++ b/sycl/source/detail/kernel_info.hpp @@ -137,79 +137,6 @@ uint32_t get_kernel_device_specific_info_with_input( return Result; } -template -inline typename Param::return_type -get_kernel_device_specific_info_host(const sycl::device &Device) = delete; - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::global_work_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - 
info::kernel_device_specific::work_group_size>(const sycl::device &Dev) { - return Dev.get_info(); -} - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_work_group_size>( - const sycl::device &) { - return {0, 0, 0}; -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::preferred_work_group_size_multiple>( - const sycl::device &Dev) { - return get_kernel_device_specific_info_host< - info::kernel_device_specific::work_group_size>(Dev); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::private_mem_size>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::ext_codeplay_num_regs>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_num_sub_groups>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_sub_group_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_num_sub_groups>( - const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_sub_group_size>( - const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} } // namespace detail } // namespace _V1 } // namespace sycl diff --git 
a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 9700fde466803..2caf958bb842b 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -79,9 +79,6 @@ static bool IsBannedPlatform(platform Platform) { // is disabled as well. // auto IsMatchingOpenCL = [](platform Platform, const std::string_view name) { - if (getSyclObjImpl(Platform)->is_host()) - return false; - const bool HasNameMatch = Platform.get_info().find( name) != std::string::npos; const auto Backend = detail::getSyclObjImpl(Platform)->getBackend(); @@ -466,15 +463,9 @@ platform_impl::get_devices(info::device_type DeviceType) const { ods_target_list *OdsTargetList = SYCLConfig::get(); - if (is_host() && (DeviceType == info::device_type::host || - DeviceType == info::device_type::all)) { - Res.push_back( - createSyclObjFromImpl(device_impl::getHostDeviceImpl())); - } - // If any DeviceType other than host was requested for host platform, // an empty vector will be returned. 
- if (is_host() || DeviceType == info::device_type::host) + if (DeviceType == info::device_type::host) return Res; pi_uint32 NumDevices = 0; @@ -556,9 +547,6 @@ platform_impl::get_devices(info::device_type DeviceType) const { } bool platform_impl::has_extension(const std::string &ExtensionName) const { - if (is_host()) - return false; - std::string AllExtensionNames = get_platform_info_string_impl( MPlatform, getPlugin(), detail::PiInfoCode::value); @@ -580,9 +568,6 @@ pi_native_handle platform_impl::getNative() const { template typename Param::return_type platform_impl::get_info() const { - if (is_host()) - return get_platform_info_host(); - return get_platform_info(this->getHandleRef(), getPlugin()); } diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 0bb8d1ab77e2f..e13bd0a3a1b31 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -89,9 +89,6 @@ class platform_impl { template typename Param::return_type get_backend_info() const; - /// \return true if this SYCL platform is a host platform. - bool is_host() const { return MHostPlatform; }; - /// Returns the backend of this platform. backend getBackend(void) const { return MBackend; } @@ -107,11 +104,6 @@ class platform_impl { /// \return an instance of OpenCL cl_platform_id. cl_platform_id get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of platform doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PLATFORM); - } return pi::cast(MPlatform); } @@ -123,10 +115,6 @@ class platform_impl { /// /// \return a raw plug-in platform handle. 
const sycl::detail::pi::PiPlatform &getHandleRef() const { - if (is_host()) - throw invalid_object_error("This instance of platform is a host instance", - PI_ERROR_INVALID_PLATFORM); - return MPlatform; } diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index 42c41b5063cf5..70bcd626024d9 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -59,36 +59,6 @@ get_platform_info(sycl::detail::pi::PiPlatform Plt, const PluginPtr &Plugin) { return split_string(Result, ' '); } -// Host platform information methods -template -inline typename Param::return_type get_platform_info_host() = delete; - -template <> -inline std::string get_platform_info_host() { - return "FULL PROFILE"; -} - -template <> -inline std::string get_platform_info_host() { - return "1.2"; -} - -template <> inline std::string get_platform_info_host() { - return "SYCL host platform"; -} - -template <> -inline std::string get_platform_info_host() { - return ""; -} - -template <> -inline std::vector -get_platform_info_host() { - // TODO update when appropriate - return {}; -} - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index d65f3163b961f..584b2487f5dee 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -72,9 +72,8 @@ program_impl::program_impl( } MDevices = ProgramList[0]->MDevices; std::vector DevicesSorted; - if (!is_host()) { - DevicesSorted = sort_devices_by_cl_device_id(MDevices); - } + DevicesSorted = sort_devices_by_cl_device_id(MDevices); + check_device_feature_support(MDevices); std::list> Locks; for (const auto &Prg : ProgramList) { @@ -85,18 +84,16 @@ program_impl::program_impl( "Not all programs are associated with the same context", PI_ERROR_INVALID_PROGRAM); } - if (!is_host()) { - std::vector PrgDevicesSorted = - sort_devices_by_cl_device_id(Prg->MDevices); - if 
(PrgDevicesSorted != DevicesSorted) { - throw invalid_object_error( - "Not all programs are associated with the same devices", - PI_ERROR_INVALID_PROGRAM); - } + + std::vector PrgDevicesSorted = + sort_devices_by_cl_device_id(Prg->MDevices); + if (PrgDevicesSorted != DevicesSorted) { + throw invalid_object_error( + "Not all programs are associated with the same devices", + PI_ERROR_INVALID_PROGRAM); } } - if (!is_host()) { std::vector Devices(get_pi_devices()); std::vector Programs; bool NonInterOpToLink = false; @@ -113,7 +110,6 @@ program_impl::program_impl( LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, nullptr, &MProgram); Plugin->checkPiResult(Err); - } } program_impl::program_impl(ContextImplPtr Context, @@ -208,7 +204,7 @@ program_impl::program_impl(ContextImplPtr Context, program_impl::~program_impl() { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host() && MProgram != nullptr) { + if (MProgram != nullptr) { const PluginPtr &Plugin = getPlugin(); Plugin->call(MProgram); } @@ -216,11 +212,6 @@ program_impl::~program_impl() { cl_program program_impl::get() const { throw_if_state_is(program_state::none); - if (is_host()) { - throw invalid_object_error( - "This instance of program doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PROGRAM); - } getPlugin()->call(MProgram); return pi::cast(MProgram); } @@ -229,19 +220,16 @@ void program_impl::compile_with_kernel_name(std::string KernelName, std::string CompileOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::none); - if (!is_host()) { create_pi_program_with_kernel_name( KernelName, /*JITCompilationIsRequired=*/(!CompileOptions.empty())); compile(CompileOptions); - } MState = program_state::compiled; } void program_impl::link(std::string LinkOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::compiled); - if (!is_host()) { check_device_feature_support(MDevices); std::vector 
Devices(get_pi_devices()); const PluginPtr &Plugin = getPlugin(); @@ -263,16 +251,12 @@ void program_impl::link(std::string LinkOptions) { Plugin->checkPiResult(Err); MLinkOptions = LinkOptions; MBuildOptions = LinkOptions; - } MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, bool IsCreatedFromSource) const { throw_if_state_is(program_state::none); - if (is_host()) { - return !IsCreatedFromSource; - } std::vector Devices(get_pi_devices()); pi_uint64 function_ptr; @@ -299,14 +283,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::shared_ptr PtrToSelf, bool IsCreatedFromSource) const { throw_if_state_is(program_state::none); - if (is_host()) { - if (IsCreatedFromSource) - throw invalid_object_error("This instance of program is a host instance", - PI_ERROR_INVALID_PROGRAM); - - return createSyclObjFromImpl( - std::make_shared(MContext, PtrToSelf)); - } auto [Kernel, ArgMask] = get_pi_kernel_arg_mask_pair(KernelName); return createSyclObjFromImpl(std::make_shared( Kernel, MContext, PtrToSelf, IsCreatedFromSource, nullptr, ArgMask)); @@ -314,8 +290,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::vector> program_impl::get_binaries() const { throw_if_state_is(program_state::none); - if (is_host()) - return {}; std::vector> Result; const PluginPtr &Plugin = getPlugin(); diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 32a0c7fd38bfe..1fa8767774961 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -134,9 +134,6 @@ class program_impl { /// not retained before return. const sycl::detail::pi::PiProgram &getHandleRef() const { return MProgram; } - /// \return true if this SYCL program is a host program. - bool is_host() const { return MContext->is_host(); } - /// Compiles the SYCL kernel function into the encapsulated raw program. /// /// The kernel function is defined by its name. 
This member function @@ -215,14 +212,11 @@ class program_impl { /// \return the SYCL context that this program was constructed with. context get_context() const { - if (is_host()) - return context(); return createSyclObjFromImpl(MContext); } /// \return the Plugin associated with the context of this program. const PluginPtr &getPlugin() const { - assert(!is_host() && "Plugin is not available for Host."); return MContext->getPlugin(); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 05c579f78a405..2c7876ea14c08 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -42,10 +42,9 @@ getPIEvents(const std::vector &DepEvents) { template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; - if (!is_host()) - getPlugin()->call( - MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, - nullptr); + getPlugin()->call( + MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, + nullptr); return result; } @@ -142,8 +141,7 @@ event queue_impl::memset(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "memory_ptr", reinterpret_cast(Ptr)); xpti::addMetadata(TEvent, "value_set", Value); xpti::addMetadata(TEvent, "memory_size", Count); @@ -190,8 +188,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 
0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "src_memory_ptr", reinterpret_cast(Src)); xpti::addMetadata(TEvent, "dest_memory_ptr", reinterpret_cast(Dest)); @@ -430,9 +427,7 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, if (WaitEvent) { device D = get_device(); std::string DevStr; - if (getSyclObjImpl(D)->is_host()) - DevStr = "HOST"; - else if (D.is_cpu()) + if (D.is_cpu()) DevStr = "CPU"; else if (D.is_gpu()) DevStr = "GPU"; @@ -588,14 +583,12 @@ bool queue_impl::ext_oneapi_empty() const { } // Check the status of the backend queue if this is not a host queue. - if (!is_host()) { - pi_bool IsReady = false; - getPlugin()->call( - MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, - nullptr); - if (!IsReady) - return false; - } + pi_bool IsReady = false; + getPlugin()->call( + MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, + nullptr); + if (!IsReady) + return false; // We may have events like host tasks which are not submitted to the backend // queue so we need to get their status separately. @@ -609,7 +602,7 @@ bool queue_impl::ext_oneapi_empty() const { EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtrIt->lock()) - if (EventImplSharedPtr->is_host() && + if (EventImplSharedPtr->isHost() && EventImplSharedPtr ->get_info() != info::event_command_status::complete) @@ -641,7 +634,7 @@ void queue_impl::revisitUnenqueuedCommandsState( std::remove_if( Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(), [](const EventImplPtr &CommandEvent) { - return (CommandEvent->is_host() ? CommandEvent->isCompleted() + return (CommandEvent->isHost() ? 
CommandEvent->isCompleted() : CommandEvent->isEnqueued()); }), Deps.UnenqueuedCmdEvents.end()); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index dff24ad1dfec1..c205b5916f302 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -106,13 +106,12 @@ class queue_impl { queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), - MPropList(PropList), MHostQueue(MDevice->is_host()), + MPropList(PropList), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -124,8 +123,7 @@ class queue_impl { if (MDevice->has(aspect::queue_profiling)) { // When piGetDeviceAndHostTimer is not supported, compute the // profiling time OpenCL version < 2.1 case - if (!getDeviceImplPtr()->is_host() && - !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) + if (!getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) MFallbackProfiling = true; } else { throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -154,7 +152,7 @@ class queue_impl { "Cannot enable fusion if device does not support fusion"); } if (!Context->isDeviceValid(Device)) { - if (!Context->is_host() && Context->getBackend() == backend::opencl) + if (Context->getBackend() == backend::opencl) throw sycl::invalid_object_error( "Queue cannot be constructed with the given context and device " "since the device is not a member of the context (descendants of " @@ -166,13 +164,12 @@ class queue_impl { "descendant of its member.", PI_ERROR_INVALID_DEVICE); } - if (!MHostQueue) { - const QueueOrder QOrder = - 
MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; - MQueues.push_back(createQueue(QOrder)); - // This section is the second part of the instrumentation that uses the - // tracepoint information and notifies - } + + const QueueOrder QOrder = + MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; + MQueues.push_back(createQueue(QOrder)); + // This section is the second part of the instrumentation that uses the + // tracepoint information and notifies // We enable XPTI tracing events using the TLS mechanism; if the code // location data is available, then the tracing data will be rich. @@ -198,13 +195,11 @@ class queue_impl { MDevice->getDeviceName()); xpti::addMetadata( TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", + xpti::addMetadata(TEvent, "queue_handle", reinterpret_cast(getHandleRef())); }); // Also publish to TLS @@ -263,13 +258,11 @@ class queue_impl { MDevice->getDeviceName()); xpti::addMetadata( TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); + xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); }); // Also publish to TLS before notification xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -287,13 +280,12 @@ class queue_impl { /// \param AsyncHandler is a SYCL asynchronous exception handler. 
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) - : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), + : MContext(Context), MAsyncHandler(AsyncHandler), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -309,13 +301,11 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)) { queue_impl_interop(PiQueue); } @@ -336,19 +326,12 @@ class queue_impl { } #endif throw_asynchronous(); - if (!MHostQueue) { - cleanup_fusion_cmd(); - getPlugin()->call(MQueues[0]); - } + cleanup_fusion_cmd(); + getPlugin()->call(MQueues[0]); } /// \return an OpenCL interoperability queue handle. cl_command_queue get() { - if (MHostQueue) { - throw invalid_object_error( - "This instance of queue doesn't support OpenCL interoperability", - PI_ERROR_INVALID_QUEUE); - } getPlugin()->call(MQueues[0]); return pi::cast(MQueues[0]); } @@ -367,9 +350,6 @@ class queue_impl { /// \return an associated SYCL device. device get_device() const { return createSyclObjFromImpl(MDevice); } - /// \return true if this queue is a SYCL host queue. - bool is_host() const { return MHostQueue; } - /// \return true if this queue has discard_events support. 
bool supportsDiscardingPiEvents() const { return MSupportsDiscardingPiEvents; @@ -859,7 +839,7 @@ class queue_impl { "function objects should use the sycl::handler API instead."); } - handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); + handler Handler(Self, PrimaryQueue, SecondaryQueue); Handler.saveCodeLoc(Loc); PreventSubmit = true; try { @@ -969,7 +949,6 @@ class queue_impl { /// Iterator through MQueues. size_t MNextQueueIdx = 0; - const bool MHostQueue = false; /// Indicates that a native out-of-order queue could not be created and we /// need to emulate it with multiple native in-order queues. bool MEmulateOOO = false; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0739ac77373b7..d6c41f39e9942 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2246,7 +2246,7 @@ void SetArgBasedOnType( const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { case kernel_param_kind_t::kind_stream: @@ -2300,13 +2300,6 @@ void SetArgBasedOnType( break; } case kernel_param_kind_t::kind_specialization_constants_buffer: { - if (IsHost) { - throw sycl::exception( - sycl::make_error_code(sycl::errc::feature_not_supported), - "SYCL2020 specialization constants are not yet supported on host " - "device " + - codeToString(PI_ERROR_INVALID_OPERATION)); - } assert(DeviceImageImpl != nullptr); sycl::detail::pi::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); @@ -2343,7 +2336,7 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, 
DeviceImageImpl, getMemAllocationFunc, - Queue->get_context(), Queue->is_host(), Arg, + Queue->get_context(), Arg, NextTrueIndex); }; @@ -2940,8 +2933,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { NDRDescT &NDRDesc = ExecKernel->MNDRDesc; std::vector &Args = ExecKernel->MArgs; - if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator)) { + if (MQueue->getDeviceImplPtr()->getBackend() == + backend::ext_intel_esimd_emulator) { for (ArgDesc &Arg : Args) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -2954,10 +2947,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Plugin->call(RawEvents.size(), &RawEvents[0]); } - if (MQueue->is_host()) { - ExecKernel->MHostKernel->call(NDRDesc, - getEvent()->getHostProfilingInfo()); - } else { assert(MQueue->getDeviceImplPtr()->getBackend() == backend::ext_intel_esimd_emulator); if (MEvent != nullptr) @@ -2967,7 +2956,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { reinterpret_cast(ExecKernel->MHostKernel->getPtr()), NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - } return PI_SUCCESS; } diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 196232b95d734..d1b57182d78ff 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -678,7 +678,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (const char *HUMConfig = SYCLConfig::get()) { if (std::strcmp(HUMConfig, "0") == 0) - return Ctx->is_host(); + return false; if (std::strcmp(HUMConfig, "1") == 0) return true; } From fa08c2b3314604af314406fb73bcaf33e669f04a Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 27 May 2024 02:12:53 -0700 Subject: [PATCH 09/58] non-buildable: remove is_host from obvious 
places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.hpp | 7 +---- sycl/source/detail/device_impl.cpp | 8 ++--- sycl/source/detail/usm/usm_impl.cpp | 47 ----------------------------- 3 files changed, 3 insertions(+), 59 deletions(-) diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index af20236fc4b23..203242ee40077 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -97,11 +97,6 @@ class context_impl { /// \return an instance of OpenCL cl_context. cl_context get() const; - /// Checks if this context is a host context. - /// - /// \return true if this context is a host context. - bool is_host() const; - /// Gets asynchronous exception handler. /// /// \return an instance of SYCL async_handler. @@ -182,7 +177,7 @@ class context_impl { // OpenCL does not support using descendants of context members within that // context yet. // TODO remove once this limitation is lifted - if (!is_host() && Device->getBackend() == backend::opencl) + if (Device->getBackend() == backend::opencl) return hasDevice(Device); while (!hasDevice(Device)) { diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 2e87300425c20..c677b9165d71f 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -327,8 +327,6 @@ bool device_impl::has(aspect Aspect) const { size_t return_size = 0; switch (Aspect) { - case aspect::host: - return is_host(); case aspect::cpu: return is_cpu(); case aspect::gpu: @@ -369,16 +367,14 @@ bool device_impl::has(aspect Aspect) const { case aspect::ext_intel_mem_channel: return get_info(); case aspect::usm_atomic_host_allocations: - return is_host() || - (get_device_info_impl:: get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); case aspect::usm_shared_allocations: return get_info(); case aspect::usm_atomic_shared_allocations: - return is_host() || - (get_device_info_impl:: 
get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index ecf63bc63e427..753c27d5f678d 100755 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -73,20 +73,6 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, return nullptr; std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - if (CtxImpl->is_host()) { - if (!Alignment) { - // worst case default - Alignment = 128; - } - - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); pi_result Error = PI_ERROR_INVALID_VALUE; @@ -128,7 +114,6 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, // The spec wants a nullptr returned, not an exception. if (Error != PI_SUCCESS) return nullptr; - } #ifdef XPTI_ENABLE_INSTRUMENTATION xpti::addMetadata(PrepareNotify.traceEvent(), "memory_ptr", reinterpret_cast(RetVal)); @@ -154,24 +139,6 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, if (Size == 0) return nullptr; - if (CtxImpl->is_host()) { - if (Kind == alloc::unknown) { - RetVal = nullptr; - } else { - if (!Alignment) { - // worst case default - Alignment = 128; - } - - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); pi_result Error = PI_ERROR_INVALID_VALUE; @@ -245,7 +212,6 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, // The spec wants a nullptr returned, not an exception. 
if (Error != PI_SUCCESS) return nullptr; - } return RetVal; } @@ -284,14 +250,9 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, void freeInternal(void *Ptr, const context_impl *CtxImpl) { if (Ptr == nullptr) return; - if (CtxImpl->is_host()) { - // need to use alignedFree here for Windows - detail::OSUtil::alignedFree(Ptr); - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); Plugin->call(C, Ptr); - } } void free(void *Ptr, const context &Ctxt, @@ -578,10 +539,6 @@ alloc get_pointer_type(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Everything on a host device is just system malloc so call it host - if (CtxImpl->is_host()) - return alloc::host; - pi_context PICtx = CtxImpl->getHandleRef(); pi_usm_type AllocTy; @@ -631,10 +588,6 @@ device get_pointer_device(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Just return the host device in the host context - if (CtxImpl->is_host()) - return Ctxt.get_devices()[0]; - // Check if ptr is a host allocation if (get_pointer_type(Ptr, Ctxt) == alloc::host) { auto Devs = CtxImpl->getDevices(); From d021de9af53da859390f6519730dd363b9b2d4bb Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 27 May 2024 06:03:56 -0700 Subject: [PATCH 10/58] not-buildable: remove is_host in simple places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/buffer_impl.cpp | 3 --- sycl/source/detail/memory_manager.cpp | 27 +-------------------------- sycl/source/detail/memory_manager.hpp | 4 ---- sycl/source/detail/queue_impl.cpp | 11 ++++------- sycl/source/detail/sycl_mem_obj_t.cpp | 23 ++--------------------- 5 files changed, 7 insertions(+), 61 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 835c732a40bf9..d7d77205b162c 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ 
b/sycl/source/detail/buffer_impl.cpp @@ -25,9 +25,6 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) && - "Internal error. Allocating memory on the host " - "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( std::move(Context), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 840f95ea7a643..f4e42363cb6e1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -266,11 +266,6 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, return; } - if (TargetContext->is_host()) { - MemObj->releaseHostMem(MemAllocation); - return; - } - const PluginPtr &Plugin = TargetContext->getPlugin(); memReleaseHelper(Plugin, pi::cast(MemAllocation)); } @@ -288,20 +283,6 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, OutEvent); } -void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const sycl::property_list &) { - std::ignore = HostPtrReadOnly; - std::ignore = Size; - - // Can return user pointer directly if it is not a nullptr. 
- if (UserPtr) - return UserPtr; - - return MemObj->allocateHostMem(); - ; -} - void *MemoryManager::allocateInteropMemObject( ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, @@ -398,10 +379,7 @@ void *MemoryManager::allocateMemBuffer( const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; - if (TargetContext->is_host()) - MemPtr = - allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); - else if (UserPtr && InteropContext) + if (UserPtr && InteropContext) MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); @@ -420,9 +398,6 @@ void *MemoryManager::allocateMemImage( const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { - if (TargetContext->is_host()) - return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, - PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 1d2800bf9dadc..7be17898bc0d9 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -85,10 +85,6 @@ class __SYCL_EXPORT MemoryManager { static void releaseMemObj(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, void *UserPtr); - static void *allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const sycl::property_list &PropsList); - static void * allocateInteropMemObject(ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2c7876ea14c08..bba423df61b60 100644 
--- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -283,12 +283,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (is_host() || MEmulateOOO) + if (Event->isHost() || MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (is_host() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (Event->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); @@ -299,7 +299,7 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { - assert(is_host() || MEmulateOOO); + assert(MEmulateOOO); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -369,9 +369,6 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr : MExtGraphDeps.LastEventPtr; @@ -520,7 +517,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. 
Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - const bool SupportsPiFinish = !is_host() && !MEmulateOOO; + const bool SupportsPiFinish = !MEmulateOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { if (std::shared_ptr EventImplSharedPtr = diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index bb4c5f4e1441d..87f005fe8ca78 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -33,12 +33,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -84,12 +78,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -191,19 +179,12 @@ void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context, // The data for the allocation can be provided via either the user pointer // (InitFromUserData, can be read-only) or a runtime-allocated read-write // HostPtr. 
We can have one of these scenarios: - // 1. The allocation is the first one and on host. InitFromUserData == true. - // 2. The allocation is the first one and isn't on host. InitFromUserData + // 1. The allocation is the first one and isn't on host. InitFromUserData // varies based on unified host memory support and whether or not the data can // be discarded. - // 3. The allocation is not the first one and is on host. InitFromUserData == - // false, HostPtr == nullptr. This can only happen if the allocation command - // is not linked since it would be a no-op otherwise. Attempt to reuse the - // user pointer if it's read-write, but do not copy its contents if it's not. - // 4. The allocation is not the first one and not on host. InitFromUserData == + // 2. The allocation is not the first one and not on host. InitFromUserData == // false, HostPtr is provided if the command is linked. The host pointer is // guaranteed to be reused in this case. - if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly) - InitFromUserData = true; if (InitFromUserData) { assert(!HostPtr && "Cannot init from user data and reuse host ptr provided " From 5b60b90c37d2bc388272eaed40f375403a148e80 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 28 May 2024 04:26:44 -0700 Subject: [PATCH 11/58] draft Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 27 ----------- sycl/source/detail/scheduler/commands.cpp | 21 ++++----- .../source/detail/scheduler/graph_builder.cpp | 46 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 32 +++++++++---- 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index f4e42363cb6e1..792c1c57bd3f1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -921,9 +921,6 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, 
sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!SrcQueue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -962,9 +959,6 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - if (!Length) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -1000,9 +994,6 @@ void MemoryManager::prefetch_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in prefetch_usm."); - const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1024,9 +1015,6 @@ void MemoryManager::advise_usm( std::vector /*DepEvents*/, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in advise_usm."); - const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1049,9 +1037,6 @@ void MemoryManager::copy_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1137,9 +1122,6 @@ void MemoryManager::fill_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { 
- assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1177,9 +1159,6 @@ void MemoryManager::memset_2d_usm( char Value, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1714,8 +1693,6 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *Mem, size_t Length, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in prefetch_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, _pi_usm_migration_flags(0), Deps.size(), @@ -1728,8 +1705,6 @@ void MemoryManager::ext_oneapi_advise_usm_cmd_buffer( size_t Length, pi_mem_advice Advice, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in advise_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, Advice, Deps.size(), Deps.data(), @@ -1748,8 +1723,6 @@ void MemoryManager::copy_image_bindless( const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_image_bindless."); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d6c41f39e9942..0a25d7b3ee6c1 100644 --- a/sycl/source/detail/scheduler/commands.cpp 
+++ b/sycl/source/detail/scheduler/commands.cpp @@ -671,12 +671,9 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, const QueueImplPtr &WorkerQueue = getWorkerQueue(); const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); - // 1. Async work is not supported for host device. - // 2. Non-host events can be ignored if they are not fully initialized. - // 3. Some types of commands do not produce PI events after they are - // enqueued - // (e.g. alloca). Note that we can't check the pi event to make that - // distinction since the command might still be unenqueued at this point. + // 1. Non-host events can be ignored if they are not fully initialized. + // 2. Some types of commands do not produce PI events after they are + // enqueued (e.g. alloca). Note that we can't check the pi event to make that distinction since the command might still be unenqueued at this point. bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -692,11 +689,13 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && !WorkerContext->is_host()) { + if (DepEventContext == WorkerContext) + MPreparedDepsEvents.push_back(std::move(DepEvent)); + else + { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); - } else - MPreparedDepsEvents.push_back(std::move(DepEvent)); + } return ConnectionCmd; } @@ -3106,10 +3105,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. 
- return PI_SUCCESS; - } const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d1b57182d78ff..bbb6d8de12f98 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -243,7 +243,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(), + MemObject->MRecord.reset(new MemObjRecord{Queue ? Queue->getContextImplPtr() : nullptr, LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -317,7 +317,7 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, assert(AllocaCmdSrc->MIsActive && "Expected source alloca command to be active"); - if (AllocaCmdSrc->getQueue()->is_host()) { + if (!AllocaCmdSrc->getQueue()) { UnMapMemObject *UnMapCmd = new UnMapMemObject( AllocaCmdDst, *AllocaCmdDst->getRequirement(), &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue()); @@ -427,7 +427,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(HostAllocaCmd->getQueue()->is_host() && + assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); @@ -525,16 +525,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, auto SYCLMemObj = static_cast(Req->MSYCLMemObj); SYCLMemObj->handleWriteAccessorCreation(); } - - const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); - - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue); + // Host accessor is not attached to any queue so no 
QueueImplPtr object to be sent to getOrInsertMemObjRecord. + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); if (MPrintOptionsArray[BeforeAddHostAcc]) printGraphAsDot("before_addHostAccessor"); markModifiedIfWrite(Record, Req); AllocaCommandBase *HostAllocaCmd = - getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(), Record->MCurContext)) { @@ -682,6 +680,10 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (std::strcmp(HUMConfig, "1") == 0) return true; } + // Host tasks & host accessors are covered with no device context but provide the required support. + if (Ctx == nullptr) + return true; + for (const device &Device : Ctx->getDevices()) { if (!Device.get_info()) return false; @@ -696,9 +698,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - + auto Context = Queue != nullptr ? Queue->getContextImplPtr() : nullptr; AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false); + Record, Req, Context, /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -729,7 +731,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // the user pointer is read-only is still not handled: it leads to // unnecessary copy on devices with unified host memory support.
const bool HostUnifiedMemory = - checkHostUnifiedMemory(Queue->getContextImplPtr()); + checkHostUnifiedMemory(Context); SYCLMemObjI *MemObj = Req->MSYCLMemObj; const bool InitFromUserData = Record->MAllocaCommands.empty() && (HostUnifiedMemory || MemObj->isInterop()); @@ -745,16 +747,14 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // There's no need to make a host allocation if the buffer is not // initialized with user data. if (MemObj->hasUserDataPtr()) { - QueueImplPtr DefaultHostQueue = - Scheduler::getInstance().getDefaultHostQueue(); AllocaCommand *HostAllocaCmd = new AllocaCommand( - DefaultHostQueue, FullReq, true /* InitFromUserData */, + nullptr, FullReq, true /* InitFromUserData */, nullptr /* LinkedAllocaCmd */, MemObj->isHostPointerReadOnly() /* IsConst */); Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->MCurContext = DefaultHostQueue->getContextImplPtr(); + Record->usedOnHost(); } } } else { @@ -766,7 +766,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if (Queue->is_host() != Record->MCurContext->is_host()) { + if ((Context != nullptr) + (Record->MCurContext != nullptr) == 1) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -778,7 +778,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( bool PinnedHostMemory = MemObj->usesPinnedHostMemory(); bool HostUnifiedMemoryOnNonHostDevice = - Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext) + Queue == nullptr ? 
checkHostUnifiedMemory(Record->MCurContext) : HostUnifiedMemory; if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) { AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq( @@ -818,14 +818,14 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // construction, host allocation doesn't. So, device allocation should // always be active here. Also if the "follower" command is a device one // we have to change current context to the device one. - if (Queue->is_host()) { + if (Queue == nullptr) { AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; Record->MCurContext = Queue->getContextImplPtr(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Context); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -1071,7 +1071,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. 
- if (Record->MCurContext->is_host() && + if (!Record->MCurContext && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1093,7 +1093,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + } else if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1714,12 +1714,12 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (!Queue->is_host() && !Record->MCurContext->is_host()) + if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), + nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 6fa95cb4a4a54..bcb930bc8194a 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -199,12 +199,12 @@ using FusionMap = std::unordered_map; /// There must be a single MemObjRecord for each SYCL memory object. /// /// \ingroup sycl_graph -struct MemObjRecord { +class MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} - + MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, MCurHostAccess{ MCurContext == nullptr } {} +public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -214,16 +214,32 @@ struct MemObjRecord { // Contains latest write commands working with memory object. 
LeavesCollection MWriteLeaves; + // The flag indicates that the content of the memory object was/will be + // modified. Used while deciding if copy back needed. + bool MMemModified = false; + + void usedOnDevice(ContextImplPtr& NewContext) + { + MCurContext = NewContext; + MCurHostAccess = false; + } + + void usedOnHost() + { + MCurContext = nullptr; + MCurHostAccess = true; + } + + bool usedOnHost() { return MCurHostAccess; } +protected: // The context which has the latest state of the memory object. ContextImplPtr MCurContext; - // The mode this object can be accessed with from the host context. - // Valid only if the current context is host. + // The mode this object can be accessed with from the host (host_accessor). + // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - // The flag indicates that the content of the memory object was/will be - // modified. Used while deciding if copy back needed. - bool MMemModified = false; + bool MCurHostAccess = false; }; /// DPC++ graph scheduler class. 
From 21ed380f362dd560342f75f94a58b84da50edd9c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 29 May 2024 05:58:36 -0700 Subject: [PATCH 12/58] non-buildable: eliminate getDefaultHostQueue usage Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 14 +-- sycl/source/detail/scheduler/commands.cpp | 6 +- .../source/detail/scheduler/graph_builder.cpp | 96 +++++++++---------- sycl/source/detail/scheduler/scheduler.cpp | 4 +- sycl/source/detail/scheduler/scheduler.hpp | 18 ++-- 5 files changed, 65 insertions(+), 73 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 792c1c57bd3f1..3c0ad08e0763f 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -750,23 +750,23 @@ void MemoryManager::copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { - if (SrcQueue->is_host()) { - if (TgtQueue->is_host()) - copyH2H(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + if (!SrcQueue) { + if (!TgtQueue) + copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, - std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + nullptr, DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); else - copyH2D(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, pi::cast(DstMem), std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); } else { - if (TgtQueue->is_host()) + if (!TgtQueue) copyD2H(SYCLMemObj, pi::cast(SrcMem), std::move(SrcQueue), DimSrc, SrcSize, SrcAccessRange, SrcOffset, - SrcElemSize, (char *)DstMem, std::move(TgtQueue), DimDst, DstSize, + SrcElemSize, 
(char *)DstMem, nullptr, DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); else diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0a25d7b3ee6c1..f0e3471a0f6f6 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2872,7 +2872,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize, Copy->getDst(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, + nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent); @@ -2883,11 +2883,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Requirement *Req = (Requirement *)(Copy->getDst()); AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - Scheduler::getInstance().getDefaultHostQueue(); - MemoryManager::copy( AllocaCmd->getSYCLMemObj(), Copy->getSrc(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, + nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index bbb6d8de12f98..6c9244f9ecb2c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -49,15 +49,16 @@ static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { LHS->MOffsetInBytes); } -static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { - return LHS == RHS; -} - /// Checks if current requirement is requirement for sub buffer. 
static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } +static ContextImplPtr GetContext(const QueueImplPtr& Queue) +{ + return Queue ? Queue->getContextImplPtr() : nullptr; +} + /// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { @@ -243,7 +244,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{Queue ? Queue->getContextImplPtr() : nullptr, + MemObject->MRecord.reset(new MemObjRecord{GetContext(Queue), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -282,8 +283,9 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { + auto Context = GetContext(Queue); AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Queue->getContextImplPtr()); + findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -292,7 +294,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( const Requirement *StoredReq = UpdateCommand->getRequirement(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Context); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -345,8 +347,9 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); + auto Context = GetContext(Queue); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + 
findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -362,8 +365,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && + bool Res = Record->isSameContext(AllocaCmd->getQueue()) && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -398,7 +400,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -420,7 +422,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); return NewCmd; } @@ -474,7 +476,6 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( Command * Scheduler::GraphBuilder::addCopyBack(Requirement *Req, std::vector &ToEnqueue) { - QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue(); SYCLMemObjI *MemObj = Req->MSYCLMemObj; MemObjRecord *Record = getMemObjRecord(MemObj); if (Record && MPrintOptionsArray[BeforeAddCopyBack]) @@ -485,13 +486,13 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, return nullptr; std::set Deps = - findDepsForReq(Record, Req, HostQueue->getContextImplPtr()); + findDepsForReq(Record, Req, nullptr); AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); 
auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, - SrcAllocaCmd->getQueue(), std::move(HostQueue)); + SrcAllocaCmd->getQueue(), nullptr); if (!MemCpyCmdUniquePtr) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -534,8 +535,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, AllocaCommandBase *HostAllocaCmd = getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext)) { + if (Record->isSameContext(HostAllocaCmd->getQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? (static_cast( @@ -545,15 +545,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, ToEnqueue); } } else - insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); Command *UpdateHostAccCmd = - insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue); + insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); // Need empty command to be blocked until host accessor is destructed EmptyCommand *EmptyCmd = - addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue, - Command::BlockReason::HostAccessor, ToEnqueue); + addEmptyCmd(UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); Req->MBlockedCmd = EmptyCmd; @@ -564,14 +563,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, } Command *Scheduler::GraphBuilder::addCGUpdateHost( - std::unique_ptr CommandGroup, const QueueImplPtr &HostQueue, + std::unique_ptr CommandGroup, std::vector &ToEnqueue) { auto UpdateHost = static_cast(CommandGroup.get()); Requirement *Req = UpdateHost->getReqToUpdate(); - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue); - return insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); + return 
insertMemoryMove(Record, Req, nullptr, ToEnqueue); } /// Start the search for the record from list of "leaf" commands and check if @@ -618,8 +617,10 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // Going through copying memory between contexts is not supported. if (Dep.MDepCommand) - CanBypassDep &= - sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr()); + { + auto DepQueue = Dep.MDepCommand->getQueue(); + CanBypassDep &= IsOnSameContext(Context, DepQueue); + } if (!CanBypassDep) { RetDeps.insert(DepCmd); @@ -658,7 +659,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); + bool Res = IsOnSameContext(Context, AllocaCmd->getQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -698,7 +699,7 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = Queue != nullptr ? Queue->getContextImplPtr() : nullptr; + auto Context = GetContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq( Record, Req, Context, /*AllowConst=*/false); @@ -754,7 +755,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->usedOnHost(); + Record->updateUsage(nullptr); } } } else { @@ -766,7 +767,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. 
// Can setup link between cl and host allocations only - if ((Context != nullptr) + (Record->MCurContext != nullptr) == 1) { + if ((Context != nullptr) + (Record->usedOnDevice()) == 1) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -822,7 +823,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); std::set Deps = findDepsForReq(Record, Req, Context); @@ -865,10 +866,9 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, - const QueueImplPtr &Queue, Command::BlockReason Reason, + Command::BlockReason Reason, std::vector &ToEnqueue, const bool AddDepsToLeaves) { - EmptyCommand *EmptyCmd = - new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue()); + EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -878,9 +878,9 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( EmptyCmd->MBlockReason = Reason; for (Requirement *Req : Reqs) { - MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); AllocaCommandBase *AllocaCmd = - getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); EmptyCmd->addRequirement(Cmd, AllocaCmd, Req); } // addRequirement above call addDep that already will add EmptyCmd as user for @@ -1062,8 +1062,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = - 
sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + isSameCtx = Record->isSameContext(QueueForAlloca); } // If there is alloca command we need to check if the latest memory is in @@ -1071,7 +1070,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. - if (!Record->MCurContext && + if (!Record->usedOnDevice() && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1089,21 +1088,20 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + if (!(Record->isSameContext(HT.MQueue)) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (Queue && Record->MCurContext) + } else if (Queue && Record->usedOnDevice()) NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } + std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, GetContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd) { @@ -1343,7 +1341,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( CG::CodeplayHostTask, /* Payload */ {})); ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue()); + std::move(ConnectCG), Cmd->getQueue()); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1705,7 +1703,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); - isSameCtx = sameCtx(Queue->getContextImplPtr(), 
Record->MCurContext); + isSameCtx = Record->isSameContext(Queue); } if (!isSameCtx) { @@ -1714,7 +1712,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (Queue && Record->MCurContext) + if (Queue && Record->usedOnDevice()) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1724,7 +1722,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, GetContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 0b061a86dbc62..7e5db05daf01a 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -118,12 +118,12 @@ EventImplPtr Scheduler::addCG( switch (Type) { case CG::UpdateHost: NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + AuxiliaryCmds); NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + nullptr, AuxiliaryCmds); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index bcb930bc8194a..6a2bcc4e5004a 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -203,7 +203,7 @@ class MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, MCurHostAccess{ MCurContext == nullptr } {} + MWriteLeaves{this, LeafLimit, 
AllocateDependency}, MCurContext{Ctx} {} public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -218,19 +218,19 @@ class MemObjRecord { // modified. Used while deciding if copy back needed. bool MMemModified = false; - void usedOnDevice(ContextImplPtr& NewContext) + void updateUsage(ContextImplPtr& NewContext) { MCurContext = NewContext; - MCurHostAccess = false; } - void usedOnHost() + bool isSameContext(const QueueImplPtr& Queue) const { - MCurContext = nullptr; - MCurHostAccess = true; + // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. + return LHS == (Queue ? Queue->getContextImplPtr() : nullptr); } - bool usedOnHost() { return MCurHostAccess; } + bool usedOnDevice( return MCurContext != nullptr; ) + protected: // The context which has the latest state of the memory object. ContextImplPtr MCurContext; @@ -238,8 +238,6 @@ class MemObjRecord { // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - - bool MCurHostAccess = false; }; /// DPC++ graph scheduler class. @@ -621,7 +619,6 @@ class Scheduler { /// /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, - const QueueImplPtr &HostQueue, std::vector &ToEnqueue); /// Enqueues a command to update memory to the latest state. 
@@ -759,7 +756,6 @@ class Scheduler { EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, - const QueueImplPtr &Queue, Command::BlockReason Reason, std::vector &ToEnqueue, const bool AddDepsToLeaves = true); From c533af788609ed1b86dd27307eb48045f05c7565 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 4 Jun 2024 03:41:44 -0700 Subject: [PATCH 13/58] non-buildable: cleanup queue usages Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 3 +- sycl/source/detail/scheduler/commands.cpp | 208 +++++++++------------- 2 files changed, 88 insertions(+), 123 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bba423df61b60..c1c1d3835a54d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,7 +26,8 @@ namespace sycl { inline namespace _V1 { namespace detail { -std::atomic queue_impl::MNextAvailableQueueID = 0; +// Treat 0 as reserved for "host" queue +std::atomic queue_impl::MNextAvailableQueueID = 1; static std::vector getPIEvents(const std::vector &DepEvents) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f0e3471a0f6f6..f7962bb7a5d66 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -56,7 +56,7 @@ namespace detail { // Global graph for the application extern xpti::trace_event_data_t *GSYCLGraphEvent; -bool CurrentCodeLocationValid() { +static bool CurrentCodeLocationValid() { detail::tls_code_loc_t Tls; auto CodeLoc = Tls.query(); auto FileName = CodeLoc.fileName(); @@ -65,7 +65,7 @@ bool CurrentCodeLocationValid() { (FunctionName && FunctionName[0] != '\0'); } -void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, +static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr) { if (!(xptiCheckTraceEnabled(StreamID, Type) && 
TraceEvent)) @@ -74,6 +74,17 @@ void emitInstrumentationGeneral, xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent, static_cast(TraceEvent), InstanceID, Addr); } + +static void addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) +{ + xpti::addMetadata(TraceEvent, "sycl_device", + Queue ? deviceToID(Queue->get_device()) : nullptr); + xpti::addMetadata(TraceEvent, "sycl_device_type", + Queue ? deviceToString(Queue->get_device()) : "host"); + if (Queue) + xpti::addMetadata(TraceEvent, "sycl_device_name", + getSyclObjImpl(Queue->get_device())->getDeviceName()); +} #endif #ifdef __SYCL_ENABLE_GNU_DEMANGLING struct DemangleHandle { char *p; @@ -236,9 +247,7 @@ Command::getPiEvents(const std::vector &EventImpls) const { // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. - if (EventImpl->getWorkerQueue() == WorkerQueue && + if (WorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && WorkerQueue->isInOrder() && !isHostTask()) continue; @@ -278,9 +287,7 @@ std::vector Command::getPiEventsBlocking( // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. 
- if (EventImpl->getWorkerQueue() == WorkerQueue && + if (MWorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && WorkerQueue->isInOrder() && !isHostTask()) continue; @@ -337,12 +344,10 @@ class DispatchHostTask { PluginWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &E) { - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); return (pi_result)E.get_cl_code(); } catch (...) { - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); return PI_ERROR_UNKNOWN; } } @@ -383,7 +388,7 @@ class DispatchHostTask { std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( std::string("Couldn't wait for host-task's dependencies"), WaitResult)); - HostTask.MQueue->reportAsyncException(EPtr); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr); // reset host-task's lambda and quit HostTask.MHostTask.reset(); Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); @@ -394,7 +399,7 @@ class DispatchHostTask { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { interop_handle IH{MReqToMem, HostTask.MQueue, - HostTask.MQueue->getDeviceImplPtr(), + // HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); @@ -419,7 +424,7 @@ class DispatchHostTask { } } #endif - HostTask.MQueue->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } HostTask.MHostTask.reset(); @@ -436,7 +441,7 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) 
{ auto CurrentException = std::current_exception(); - HostTask.MQueue->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } } }; @@ -449,6 +454,7 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { + assert(Queue && "Device queue is expected here"); if (!EventImpls.empty()) { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) @@ -484,7 +490,7 @@ Command::Command( MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); - MEvent->setContextImpl(MQueue->getContextImplPtr()); + MEvent->setContextImpl(MQueue ? MQueue->getContextImplPtr(): nullptr); MEvent->setStateIncomplete(); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; @@ -669,7 +675,7 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector &ToCleanUp) { const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); + const ContextImplPtr &WorkerContext = WorkerQueue ? WorkerQueue->getContextImplPtr() : nullptr; // 1. Non-host events can be ignored if they are not fully initialized. // 2. 
Some types of commands do not produce PI events after they are @@ -701,7 +707,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, } const ContextImplPtr &Command::getWorkerContext() const { - return MQueue->getContextImplPtr(); + assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); + return MWorkerQueue->getContextImplPtr(); } const QueueImplPtr &Command::getWorkerQueue() const { @@ -963,16 +970,12 @@ void AllocaCommandBase::emitInstrumentationData() { // Set the relevant meta data properties for this command if (MTraceEvent && MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); } #endif } @@ -1022,7 +1025,7 @@ pi_int32 AllocaCommand::enqueueImp() { void *HostPtr = nullptr; if (!MIsLeaderAlloca) { - if (MQueue->is_host()) { + if (!MQueue) { // Do not need to make allocation if we have a linked device allocation Command::waitForEvents(MQueue, EventImpls, Event); @@ -1033,7 +1036,7 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. MMemAllocation = MemoryManager::allocate( - MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr, + MQueue ? 
MQueue->getContextImplPtr() : nullptr, getSYCLMemObj(), MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; @@ -1043,7 +1046,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "ALLOCA ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1092,7 +1095,7 @@ void AllocaSubBufCommand::emitInstrumentationData() { xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1102,7 +1105,7 @@ void *AllocaSubBufCommand::getMemAllocation() const { // In some cases parent`s memory allocation might change (e.g., after // map/unmap operations). If parent`s memory allocation changes, sub-buffer // memory allocation should be changed as well. - if (MQueue->is_host()) { + if (!MQueue) { return static_cast( static_cast(MParentAlloca->getMemAllocation()) + MRequirement.MOffsetInBytes); @@ -1116,7 +1119,7 @@ pi_int32 AllocaSubBufCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( - MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(), + MQueue ? 
MQueue->getContextImplPtr() : nullptr, MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); @@ -1129,7 +1132,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device()) + Stream << "ALLOCA SUB BUF ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; @@ -1163,17 +1166,13 @@ void ReleaseCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "allocation_type", commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1187,9 +1186,9 @@ pi_int32 ReleaseCommand::enqueueImp() { // On host side we only allocate memory for full buffers. // Thus, deallocating sub buffers leads to double memory freeing. 
- SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF; + SkipRelease |= !MQueue && MAllocaCmd->getType() == ALLOCA_SUB_BUF; - const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host(); + const bool CurAllocaIsHost = !MAllocaCmd->getQueue(); bool NeedUnmap = false; if (MAllocaCmd->MLinkedAllocaCmd) { @@ -1213,7 +1212,7 @@ pi_int32 ReleaseCommand::enqueueImp() { : MAllocaCmd->getQueue(); EventImplPtr UnmapEventImpl(new event_impl(Queue)); - UnmapEventImpl->setContextImpl(Queue->getContextImplPtr()); + UnmapEventImpl->setContextImpl(Queue ? Queue->getContextImplPtr() : nullptr); UnmapEventImpl->setStateIncomplete(); sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef(); @@ -1237,7 +1236,7 @@ pi_int32 ReleaseCommand::enqueueImp() { Command::waitForEvents(MQueue, EventImpls, Event); else { MemoryManager::release( - MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(), + MQueue ? MQueue->getContextImplPtr() : nullptr, MAllocaCmd->getSYCLMemObj(), MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); } return PI_SUCCESS; @@ -1247,7 +1246,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "RELEASE ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1287,16 +1286,12 @@ void MapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1321,7 +1316,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MAP ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1352,16 +1347,12 @@ void UnMapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1383,9 +1374,9 @@ bool UnMapMemObject::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->getDeviceImplPtr()->getBackend() != + return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr; + MEvent->getHandleRef() != nullptr); } pi_int32 UnMapMemObject::enqueueImp() { @@ -1406,7 +1397,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UNMAP ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1428,11 +1419,11 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstAllocaCmd(DstAllocaCmd) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1449,24 +1440,19 @@ void MemCpyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); + MSrcQueue ? reinterpret_cast( + getSyclObjImpl(MSrcQueue->get_device()).get()) : nullptr); xpti::addMetadata( CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()): nullptr); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? 
MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1492,7 +1478,7 @@ bool MemCpyCommand::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->is_host() || + return !MQueue || MQueue->getDeviceImplPtr()->getBackend() != backend::ext_oneapi_level_zero || MEvent->getHandleRef() != nullptr; @@ -1521,10 +1507,10 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n"; - Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host() + Stream << "MEMCPY ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; - Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host() + Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; Stream << "\"];" << std::endl; @@ -1579,7 +1565,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UPDATE REQ ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? 
"Buffer" : "Image") << "\\n"; @@ -1606,11 +1592,11 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1627,24 +1613,19 @@ void MemCpyCommandHost::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); + MSrcQueue ? getSyclObjImpl(MSrcQueue->get_device()).get()) : "nullptr"); xpti::addMetadata( CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()) : "nullptr"); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? 
MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1726,18 +1707,13 @@ void EmptyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1766,7 +1742,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MEMCPY HOST ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1799,18 +1775,13 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1960,6 +1931,7 @@ void instrumentationAddExtraKernelMetadata( if (!SyclKernel->isCreatedFromSource()) EliminatedArgMask = SyclKernel->getKernelArgMask(); } else { + assert(Queue && "Queue with submitted kernel could not be on host"); std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName); @@ -2024,12 +1996,7 @@ void instrumentationFillCommonData(const std::string &KernelName, if (CGKernelInstanceNo > 1) return; - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(Queue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, Queue); if (!KernelName.empty()) { xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName); } @@ -2080,7 +2047,7 @@ std::pair 
emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue->getQueueID()); + Queue ? Queue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2126,7 +2093,7 @@ void ExecCGCommand::emitInstrumentationData() { if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2149,7 +2116,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "EXEC CG ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2330,6 +2297,7 @@ static pi_result SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative) { + assert(Queue && "Queue with submitted kernel could not be on host"); const PluginPtr &Plugin = Queue->getPlugin(); auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, @@ -2521,7 +2489,7 @@ pi_int32 enqueueImpKernel( const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative) { - + assert(Queue && "Queue with submitted kernel could not be on host"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); auto DeviceImpl = Queue->getDeviceImplPtr(); @@ -2636,6 +2604,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector &RawEvents, const detail::EventImplPtr 
&OutEventImpl, bool read) { + assert(Queue && "Queue with submitted read write host pipe could not be on host"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -3309,19 +3278,14 @@ void KernelFusionCommand::emitInstrumentationData() { // This function is called in the constructor of the command. At this point // the kernel fusion list is still empty, so we don't have a terrible lot of // information we could attach to this node here. - if (MFirstInstance && CmdTraceEvent) { - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); - } + if (MFirstInstance && CmdTraceEvent) + addDeviceMetadata(CmdTraceEvent, MQueue); + if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3335,7 +3299,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n" + Stream << "KERNEL FUSION on " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { From f0868f5ecb17b2886e999e4891725e1695e22c36 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 04:31:26 -0700 Subject: [PATCH 14/58] handle nullptr Queue in commands.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 ++-- sycl/source/detail/scheduler/commands.cpp | 39 ++++++++++++++++------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c205b5916f302..15e19f143f29d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -670,9 +670,9 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } - ThreadPool &getThreadPool() { - return GlobalHandler::instance().getHostTaskThreadPool(); - } + // ThreadPool &getThreadPool() { + // return GlobalHandler::instance().getHostTaskThreadPool(); + // } /// Gets the native handle of the SYCL queue. /// diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f7962bb7a5d66..55b29ac7dd426 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -87,6 +87,13 @@ static addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) } #endif +static ContextImplPtr getContext(const QueueImplPtr& Queue) +{ + if (Queue) + return Queue->getContextImplPtr(); + return nullptr; +} + #ifdef __SYCL_ENABLE_GNU_DEMANGLING struct DemangleHandle { char *p; @@ -490,7 +497,8 @@ Command::Command( MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); - MEvent->setContextImpl(MQueue ? 
MQueue->getContextImplPtr(): nullptr); + if (MQueue) + MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; @@ -707,12 +715,12 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, } const ContextImplPtr &Command::getWorkerContext() const { - assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } const QueueImplPtr &Command::getWorkerQueue() const { - assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); return MWorkerQueue; } @@ -1036,7 +1044,7 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. MMemAllocation = MemoryManager::allocate( - MQueue ? MQueue->getContextImplPtr() : nullptr, getSYCLMemObj(), MInitFromUserData, HostPtr, + getContext(MQueue), getSYCLMemObj(), MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; @@ -1119,7 +1127,7 @@ pi_int32 AllocaSubBufCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( - MQueue ? MQueue->getContextImplPtr() : nullptr, MParentAlloca->getMemAllocation(), + getContext(MQueue), MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); @@ -1212,7 +1220,7 @@ pi_int32 ReleaseCommand::enqueueImp() { : MAllocaCmd->getQueue(); EventImplPtr UnmapEventImpl(new event_impl(Queue)); - UnmapEventImpl->setContextImpl(Queue ? 
Queue->getContextImplPtr() : nullptr); + UnmapEventImpl->setContextImpl(getContext(Queue)); UnmapEventImpl->setStateIncomplete(); sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef(); @@ -1236,7 +1244,7 @@ pi_int32 ReleaseCommand::enqueueImp() { Command::waitForEvents(MQueue, EventImpls, Event); else { MemoryManager::release( - MQueue ? MQueue->getContextImplPtr() : nullptr, MAllocaCmd->getSYCLMemObj(), + getContext(MQueue), MAllocaCmd->getSYCLMemObj(), MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); } return PI_SUCCESS; @@ -2654,6 +2662,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, } pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { + assert(MQueue && "Device queue is required for command buffer enqueue"); // Wait on host command dependencies waitForPreparedHostEvents(); @@ -2819,8 +2828,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); - bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() && - MCommandGroup->getRequirements().size() == 0); + bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && + (MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); detail::EventImplPtr EventImpl = DiscardPiEvent ? 
nullptr : MEvent; @@ -2894,6 +2903,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Kernel: { + assert(MQueue && "Device queue must be present for kernel command"); CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; @@ -3039,8 +3049,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Req->MSYCLMemObj->MRecord->MAllocaCommands; for (AllocaCommandBase *AllocaCmd : AllocaCmds) - if (HostTask->MQueue->getContextImplPtr() == - AllocaCmd->getQueue()->getContextImplPtr()) { + if (getContext(HostTask->MQueue) == + getContext(AllocaCmd->getQueue()) { auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); ReqToMem.emplace_back(std::make_pair(Req, MemArg)); @@ -3064,7 +3074,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { // submitted to report exception origin properly. copySubmissionCodeLocation(); - MQueue->getThreadPool().submit( + getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem))); MShouldCompleteEventIfPossible = false; @@ -3072,6 +3082,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { + assert(MQueue && "Device queue must be present for barrier command"); const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); @@ -3081,6 +3092,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { + assert(MQueue && "Device queue must be present for barrier with wait list command"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3132,6 +3144,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { + assert(MQueue && "Device queue must be present for command buffer enqueue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ 
-3155,6 +3168,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { + assert(MQueue && "Device queue must be present for semaphore wait command"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3165,6 +3179,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreSignal: { + assert(MQueue && "Device queue must be present for semaphore signal command"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); From 3d044e896cc6ff1d851c56268dfeb2dc623b55e9 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 06:04:41 -0700 Subject: [PATCH 15/58] non-buildable: handle nullptr queue in memory_manager.cpp Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 12 +++++++----- sycl/source/detail/memory_manager.cpp | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 28bb37200392a..be32787c0aa4d 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -149,15 +149,16 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) { - this->setContextImpl(Queue->getContextImplPtr()); + // Queue == nullptr means that it is a host task event + this->setContextImpl(getContext(Queue)); this->associateWithQueue(Queue); } void event_impl::associateWithQueue(const QueueImplPtr &Queue) { MQueue = Queue; - MIsProfilingEnabled = Queue->MIsProfilingEnabled; + MIsProfilingEnabled = Queue && Queue->MIsProfilingEnabled; MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); - MState.store(HES_Complete); + MState.store(Queue ? 
HES_Complete : HES_NotComplete); } void *event_impl::instrumentationProlog(std::string &Name, int32_t StreamID, @@ -402,8 +403,9 @@ event_impl::get_backend_info() const { ->get_platform() .get_info(); } - return ""; // If the queue has been released, no platform will be associated - // so return empty string + // If the queue has been released, no platform will be associated + // so return empty string. + return ""; } template <> diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 3c0ad08e0763f..30827adb15e8f 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -482,6 +482,7 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, const detail::EventImplPtr &OutEventImpl) { (void)SrcAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(TgtQueue && "Destination mem object queue must be not nullptr"); const sycl::detail::pi::PiQueue Queue = TgtQueue->getHandleRef(); const PluginPtr &Plugin = TgtQueue->getPlugin(); @@ -560,6 +561,7 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, const detail::EventImplPtr &OutEventImpl) { (void)DstAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && "Source mem object queue is expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -641,6 +643,7 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && TgtQueue && "Source mem object and target mem object queues are expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -804,6 +807,7 @@ void MemoryManager::fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr 
Queue, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(Queue && "Fill should be called only with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); @@ -861,7 +865,7 @@ void *MemoryManager::map(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, unsigned int ElementSize, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - if (Queue->is_host()) { + if (!Queue) { throw runtime_error("Not supported configuration of map requested", PI_ERROR_INVALID_OPERATION); } @@ -907,6 +911,10 @@ void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, sycl::detail::pi::PiEvent &OutEvent) { // Host queue is not supported here. + if (!Queue) { + throw runtime_error("Not supported configuration of unmap requested", + PI_ERROR_INVALID_OPERATION); + } // All DepEvents are to the same Context. // Using the plugin of the Queue. @@ -921,6 +929,7 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(SrcQueue && "USM copy must be called with a valid device queue"); if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -959,6 +968,7 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM fill must be called with a valid device queue"); if (!Length) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -994,6 +1004,7 @@ void MemoryManager::prefetch_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM prefetch must be called with a valid device queue"); const PluginPtr &Plugin = 
Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1015,6 +1026,7 @@ void MemoryManager::advise_usm( std::vector /*DepEvents*/, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM advise must be called with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1037,6 +1049,7 @@ void MemoryManager::copy_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM copy 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1122,6 +1135,7 @@ void MemoryManager::fill_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM fill 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1159,6 +1173,7 @@ void MemoryManager::memset_2d_usm( char Value, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM memset 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1198,6 +1213,7 @@ memcpyToDeviceGlobalUSM(QueueImplPtr Queue, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "Copy to device global USM must be called with a valid device queue"); // Get or allocate USM memory for the device_global. 
DeviceGlobalUSMMem &DeviceGlobalUSM = DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue); @@ -1299,6 +1315,7 @@ static void memcpyToDeviceGlobalDirect( size_t NumBytes, size_t Offset, const void *Src, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert(Queue && "Direct copy to device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1313,6 +1330,7 @@ static void memcpyFromDeviceGlobalDirect( size_t NumBytes, size_t Offset, void *Dest, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert(Queue && "Direct copy from device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1722,7 +1740,7 @@ void MemoryManager::copy_image_bindless( sycl::detail::pi::PiImageRegion CopyExtent, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - + assert(Queue && "Copy image bindless must be called with a valid device queue"); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) From b3161e8bf8b978600e6910e7e8953a530ac26d23 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 06:55:19 -0700 Subject: [PATCH 16/58] non-buildable: build enabling Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 ++--- .../source/detail/scheduler/graph_builder.cpp | 6 +++++ sycl/source/detail/scheduler/scheduler.hpp | 23 ++++++++----------- sycl/source/handler.cpp | 9 ++++---- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 15e19f143f29d..a3463225a54d1 100644 --- a/sycl/source/detail/queue_impl.hpp +++ 
b/sycl/source/detail/queue_impl.hpp @@ -111,7 +111,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -285,7 +285,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -305,7 +305,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder) { queue_impl_interop(PiQueue); } diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 6c9244f9ecb2c..d9614e9ca9d51 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -59,6 +59,12 @@ static ContextImplPtr GetContext(const QueueImplPtr& Queue) return Queue ? Queue->getContextImplPtr() : nullptr; } +bool MemObjRecord::isSameContext(const QueueImplPtr& Queue) const +{ + // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. + return MCurContext == (Queue ? Queue->getContextImplPtr() : nullptr); +} + /// Checks if the required access mode is allowed under the current one. 
static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 6a2bcc4e5004a..61f01863c477b 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -218,26 +218,21 @@ class MemObjRecord { // modified. Used while deciding if copy back needed. bool MMemModified = false; - void updateUsage(ContextImplPtr& NewContext) - { - MCurContext = NewContext; - } - - bool isSameContext(const QueueImplPtr& Queue) const - { - // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. - return LHS == (Queue ? Queue->getContextImplPtr() : nullptr); - } - - bool usedOnDevice( return MCurContext != nullptr; ) - -protected: // The context which has the latest state of the memory object. ContextImplPtr MCurContext; // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; + + void updateUsage(ContextImplPtr& NewContext) + { + MCurContext = NewContext; + } + + bool isSameContext(const QueueImplPtr& Queue) const; + + bool usedOnDevice() { return MCurContext != nullptr; } }; /// DPC++ graph scheduler class. 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 749ab6750df5e..c0e0438d9cd2f 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -80,16 +80,15 @@ void *getValueFromDynamicParameter( } // namespace detail -handler::handler(std::shared_ptr Queue, bool IsHost) - : handler(Queue, Queue, nullptr, IsHost) {} +handler::handler(std::shared_ptr Queue) + : handler(Queue, Queue, nullptr) {} handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool IsHost) + std::shared_ptr SecondaryQueue) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue))), - MQueue(std::move(Queue)), MIsHost(IsHost) {} + MQueue(std::move(Queue)) {} handler::handler( std::shared_ptr Graph) From 2258a1cbb812161a21af5dbb9a38c170a41badc8 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 08:07:45 -0700 Subject: [PATCH 17/58] not-buildable: build enabling 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/buffer_impl.cpp | 9 +- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/memory_manager.cpp | 4 +- sycl/source/detail/queue_impl.cpp | 4 +- sycl/source/detail/queue_impl.hpp | 5 + sycl/source/detail/scheduler/commands.cpp | 136 +++++++++--------- sycl/source/detail/scheduler/commands.hpp | 12 +- .../source/detail/scheduler/graph_builder.cpp | 51 +++---- sycl/source/detail/scheduler/scheduler.hpp | 12 +- 9 files changed, 108 insertions(+), 127 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index d7d77205b162c..f13444107e9eb 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -68,10 +68,13 @@ buffer_impl::getNativeVector(backend BackendName) const { sycl::detail::pi::PiMem NativeMem = pi::cast(Cmd->getMemAllocation()); auto Ctx = Cmd->getWorkerContext(); - auto Platform = Ctx->getPlatformImpl(); // If Host Shared Memory is not supported then there is alloca for 
host that - // doesn't have platform - if (!Platform || (Platform->getBackend() != BackendName)) + // doesn't have context and platform + if (!Ctx) + continue; + PlatformImplPtr Platform = Ctx->getPlatformImpl(); + assert(Platform && "Platform must be present for device context"); + if (Platform->getBackend() != BackendName) continue; auto Plugin = Platform->getPlugin(); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index be32787c0aa4d..e34597aa008d1 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -150,7 +150,7 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, event_impl::event_impl(const QueueImplPtr &Queue) { // Queue == nullptr means that it is a host task event - this->setContextImpl(getContext(Queue)); + this->setContextImpl(queue_impl::getContext(Queue)); this->associateWithQueue(Queue); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 30827adb15e8f..e2c22f794f587 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -413,7 +413,7 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, waitForEvents(DepEvents); OutEvent = nullptr; - if (TargetContext->is_host()) + if (!TargetContext) return static_cast(static_cast(ParentMemObj) + Offset); size_t SizeInBytes = ElemSize; @@ -643,7 +643,7 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - assert(SrcQueue && TgtQueue && "Source mem object and target mem object queues are expected to be not nullptr"); + assert(SrcQueue && "Source mem object and target mem object queues are expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); diff --git a/sycl/source/detail/queue_impl.cpp 
b/sycl/source/detail/queue_impl.cpp index c1c1d3835a54d..ce4dd462eef32 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -284,12 +284,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (Event->isHost() || MEmulateOOO) + if (EImpl->isHost() || MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (Event->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (EImpl->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index a3463225a54d1..61f34c35c7baf 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -750,6 +750,11 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); + + static ContextImplPtr getContext(const QueueImplPtr& Queue) + { + return Queue ? 
Queue->getContextImplPtr() : nullptr; + } protected: event discard_or_return(const event &Event); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 55b29ac7dd426..05873f23f45a9 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -75,16 +75,32 @@ static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, static_cast(TraceEvent), InstanceID, Addr); } -static addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) +static size_t deviceToID(const device &Device) { + return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); +} + +static std::string deviceToString(device Device) { + if (Device.is_cpu()) + return "CPU"; + else if (Device.is_gpu()) + return "GPU"; + else if (Device.is_accelerator()) + return "ACCELERATOR"; + else + return "UNKNOWN"; +} + +static void addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) { xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(MQueue->get_device()) : nullptr); + Queue ? deviceToID(Queue->get_device()) : 0); xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(MQueue->get_device()) : "host"); + Queue ? 
deviceToString(Queue->get_device()) : "host"); if (Queue) xpti::addMetadata(TraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + getSyclObjImpl(Queue->get_device())->getDeviceName()); } + #endif static ContextImplPtr getContext(const QueueImplPtr& Queue) @@ -113,17 +129,6 @@ static std::string demangleKernelName(std::string Name) { static std::string demangleKernelName(std::string Name) { return Name; } #endif -static std::string deviceToString(device Device) { - if (Device.is_cpu()) - return "CPU"; - else if (Device.is_gpu()) - return "GPU"; - else if (Device.is_accelerator()) - return "ACCELERATOR"; - else - return "UNKNOWN"; -} - void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { @@ -158,12 +163,6 @@ void applyFuncOnFilteredArgs( } } -#ifdef XPTI_ENABLE_INSTRUMENTATION -static size_t deviceToID(const device &Device) { - return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); -} -#endif - static std::string accessModeToString(access::mode Mode) { switch (Mode) { case access::mode::read: @@ -253,9 +252,8 @@ Command::getPiEvents(const std::vector &EventImpls) const { // At this stage dependency is definitely pi task and need to check if // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - if (WorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -293,9 +291,8 @@ std::vector Command::getPiEventsBlocking( // At this stage dependency is definitely pi task and need to check if // current one is a host task. 
In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - if (MWorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -431,7 +428,7 @@ class DispatchHostTask { } } #endif - MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } HostTask.MHostTask.reset(); @@ -448,7 +445,7 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) { auto CurrentException = std::current_exception(); - MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } } }; @@ -471,7 +468,7 @@ void Command::waitForEvents(QueueImplPtr Queue, std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); const PluginPtr &Plugin = Queue->getPlugin(); if (MEvent != nullptr) @@ -682,8 +679,7 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector &ToCleanUp) { - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue ? WorkerQueue->getContextImplPtr() : nullptr; + const ContextImplPtr &WorkerContext = getWorkerContext(); // 1. Non-host events can be ignored if they are not fully initialized. // 2. 
Some types of commands do not produce PI events after they are @@ -714,14 +710,10 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, return ConnectionCmd; } -const ContextImplPtr &Command::getWorkerContext() const { - if (!MWorkerQueue) +ContextImplPtr Command::getWorkerContext() const { + if (!MQueue) return nullptr; - return MWorkerQueue->getContextImplPtr(); -} - -const QueueImplPtr &Command::getWorkerQueue() const { - return MWorkerQueue; + return MQueue->getContextImplPtr(); } bool Command::producesPiEvent() const { return true; } @@ -1054,7 +1046,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "ALLOCA ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1140,7 +1132,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" + Stream << "ALLOCA SUB BUF ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; @@ -1254,7 +1246,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "RELEASE ON " << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1309,7 +1301,7 @@ pi_int32 MapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); *MDstPtr = MemoryManager::map( @@ -1324,7 +1316,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1391,7 +1383,7 @@ pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(), @@ -1405,7 +1397,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "UNMAP ON " << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1452,11 +1444,10 @@ void MemCpyCommand::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - MSrcQueue ? reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get()) : nullptr); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); xpti::addMetadata( CmdTraceEvent, "copy_to", - MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()): nullptr); + MQueue ? deviceToID(MQueue->get_device()): 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, @@ -1466,8 +1457,9 @@ void MemCpyCommand::emitInstrumentationData() { #endif } -const ContextImplPtr &MemCpyCommand::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommand::getWorkerContext() const { + assert(MWorkerQueue && "Worker queue for mem cpy command must be not nullptr"); + return MWorkerQueue->getContextImplPtr(); } bool MemCpyCommand::producesPiEvent() const { @@ -1499,7 +1491,7 @@ pi_int32 MemCpyCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), @@ -1515,7 +1507,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MEMCPY ON " << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue @@ -1573,7 +1565,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "UPDATE REQ ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1625,11 +1617,10 @@ void MemCpyCommandHost::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - MSrcQueue ? getSyclObjImpl(MSrcQueue->get_device()).get()) : "nullptr"); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); xpti::addMetadata( CmdTraceEvent, "copy_to", - MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()) : "nullptr"); + MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, @@ -1639,12 +1630,13 @@ void MemCpyCommandHost::emitInstrumentationData() { #endif } -const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommandHost::getWorkerContext() const { + assert(MWorkerQueue && "Worker queue for mem cpy host command must be not nullptr"); + return MWorkerQueue->getContextImplPtr(); } pi_int32 MemCpyCommandHost::enqueueImp() { - const QueueImplPtr &Queue = getWorkerQueue(); + const QueueImplPtr &Queue = MWorkerQueue; waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); @@ -1660,7 +1652,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange, @@ -1671,8 +1663,8 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } -EmptyCommand::EmptyCommand(QueueImplPtr Queue) - : Command(CommandType::EMPTY_TASK, std::move(Queue)) { +EmptyCommand::EmptyCommand() + : Command(CommandType::EMPTY_TASK, nullptr) { emitInstrumentationDataProxy(); } @@ -1750,7 +1742,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MEMCPY HOST ON " << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -2055,7 +2047,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + Queue ? Queue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2124,7 +2116,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "EXEC CG ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2670,7 +2662,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { // submissions of the command buffer itself will not receive dependencies on // them, e.g. 
initial copies from host to device std::vector EventImpls = MPreparedDepsEvents; - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); std::vector RawEvents = getPiEvents(EventImpls); if (!RawEvents.empty()) { const PluginPtr &Plugin = MQueue->getPlugin(); @@ -2826,7 +2818,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && (MCommandGroup->getRequirements().size() == 0); @@ -3050,7 +3042,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { for (AllocaCommandBase *AllocaCmd : AllocaCmds) if (getContext(HostTask->MQueue) == - getContext(AllocaCmd->getQueue()) { + getContext(AllocaCmd->getQueue())) { auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); ReqToMem.emplace_back(std::make_pair(Req, MemArg)); @@ -3294,7 +3286,7 @@ void KernelFusionCommand::emitInstrumentationData() { // the kernel fusion list is still empty, so we don't have a terrible lot of // information we could attach to this node here. if (MFirstInstance && CmdTraceEvent) - addDeviceMetadata(CmdTraceEVent, MQueue); + addDeviceMetadata(CmdTraceEvent, MQueue); if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS @@ -3314,7 +3306,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n" + Stream << "KERNEL FUSION on " << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { @@ -3354,7 +3346,7 @@ pi_int32 UpdateCommandBufferCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); for (auto &Node : MNodes) { auto CG = static_cast(Node->MCommandGroup.get()); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 89cabd134a7e1..ea2ba3ea72118 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -223,11 +223,7 @@ class Command { /// Get the context of the queue this command will be submitted to. Could /// differ from the context of MQueue for memory copy commands. - virtual const ContextImplPtr &getWorkerContext() const; - - /// Get the queue this command will be submitted to. Could differ from MQueue - /// for memory copy commands. - const QueueImplPtr &getWorkerQueue() const; + virtual ContextImplPtr getWorkerContext() const; /// Returns true iff the command produces a PI event on non-host devices. virtual bool producesPiEvent() const; @@ -414,7 +410,7 @@ class Command { /// implement lock in the graph, or to merge several nodes into one. 
class EmptyCommand : public Command { public: - EmptyCommand(QueueImplPtr Queue); + EmptyCommand(); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MRequirements[0]; } @@ -586,7 +582,7 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; bool producesPiEvent() const final; private: @@ -610,7 +606,7 @@ class MemCpyCommandHost : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; private: pi_int32 enqueueImp() final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d9614e9ca9d51..8778ad6927c3e 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -54,15 +54,10 @@ static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -static ContextImplPtr GetContext(const QueueImplPtr& Queue) -{ - return Queue ? Queue->getContextImplPtr() : nullptr; -} - -bool MemObjRecord::isSameContext(const QueueImplPtr& Queue) const +static bool isOnSameContext(const ContextImplPtr Context, const QueueImplPtr& Queue) { // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. - return MCurContext == (Queue ? Queue->getContextImplPtr() : nullptr); + return Context == queue_impl::getContext(Queue); } /// Checks if the required access mode is allowed under the current one. 
@@ -250,7 +245,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{GetContext(Queue), + MemObject->MRecord.reset(new MemObjRecord{queue_impl::getContext(Queue), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -289,7 +284,7 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); @@ -353,7 +348,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); std::set Deps = findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); @@ -371,7 +366,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = Record->isSameContext(AllocaCmd->getQueue()) && + bool Res = isOnSameContext(Record->MCurContext, AllocaCmd->getQueue()) && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -406,7 +401,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { - Record->updateUsage(Context); + Record->MCurContext = Context; return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -428,7 
+423,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); - Record->updateUsage(Context); + Record->MCurContext = Context; return NewCmd; } @@ -541,7 +536,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, AllocaCommandBase *HostAllocaCmd = getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (Record->isSameContext(HostAllocaCmd->getQueue())) { + if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? (static_cast( @@ -625,7 +620,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, if (Dep.MDepCommand) { auto DepQueue = Dep.MDepCommand->getQueue(); - CanBypassDep &= IsOnSameContext(Context, DepQueue); + CanBypassDep &= isOnSameContext(Context, DepQueue); } if (!CanBypassDep) { @@ -665,7 +660,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = IsOnSameContext(Context, AllocaCmd->getQueue()); + bool Res = isOnSameContext(Context, AllocaCmd->getQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -705,7 +700,7 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq( Record, Req, Context, /*AllowConst=*/false); @@ -761,7 +756,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( 
Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->updateUsage(nullptr); + Record->MCurContext = nullptr; } } } else { @@ -773,7 +768,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if ((Context != nullptr) + (Record->usedOnDevice()) == 1) { + if ((Context != nullptr) != (Record->MCurContext != nullptr)) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -829,7 +824,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->updateUsage(Context); + Record->MCurContext =Context; std::set Deps = findDepsForReq(Record, Req, Context); @@ -1068,7 +1063,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = Record->isSameContext(QueueForAlloca); + isSameCtx = isOnSameContext(Record->MCurContext, QueueForAlloca); } // If there is alloca command we need to check if the latest memory is in @@ -1076,7 +1071,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. 
- if (!Record->usedOnDevice() && + if (!Record->MCurContext && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1094,11 +1089,11 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (!(Record->isSameContext(HT.MQueue)) { + if (!isOnSameContext(Record->MCurContext, HT.MQueue)) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (Queue && Record->usedOnDevice()) + } else if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1107,7 +1102,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( } std::set Deps = - findDepsForReq(Record, Req, GetContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd) { @@ -1709,7 +1704,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); - isSameCtx = Record->isSameContext(Queue); + isSameCtx = isOnSameContext(Record->MCurContext, Queue); } if (!isSameCtx) { @@ -1718,7 +1713,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (Queue && Record->usedOnDevice()) + if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1728,7 +1723,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, GetContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 61f01863c477b..d3462872c9ddf 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -199,12 +199,11 
@@ using FusionMap = std::unordered_map; /// There must be a single MemObjRecord for each SYCL memory object. /// /// \ingroup sycl_graph -class MemObjRecord { +struct MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} -public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -224,15 +223,6 @@ class MemObjRecord { // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - - void updateUsage(ContextImplPtr& NewContext) - { - MCurContext = NewContext; - } - - bool isSameContext(const QueueImplPtr& Queue) const; - - bool usedOnDevice() { return MCurContext != nullptr; } }; /// DPC++ graph scheduler class. From df27615254aff2efd52952930673920c521fd3fb Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 08:49:20 -0700 Subject: [PATCH 18/58] almost buildable: build enabling 3 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 +++--- sycl/source/detail/scheduler/commands.cpp | 6 +++--- sycl/source/detail/scheduler/commands.hpp | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 61f34c35c7baf..3bd7b6ea7ec0a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -670,9 +670,9 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } - // ThreadPool &getThreadPool() { - // return GlobalHandler::instance().getHostTaskThreadPool(); - // } + static ThreadPool &getThreadPool() { + return GlobalHandler::instance().getHostTaskThreadPool(); + } /// Gets the native handle of the SYCL queue. 
/// diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 05873f23f45a9..d0a790ed97059 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -65,7 +65,7 @@ static bool CurrentCodeLocationValid() { (FunctionName && FunctionName[0] != '\0'); } -static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, +void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr) { if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent)) @@ -2424,7 +2424,7 @@ pi_int32 enqueueImpCommandBufferKernel( &getMemAllocationFunc](sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) { sycl::detail::SetArgBasedOnType(Plugin, PiKernel, DeviceImageImpl, - getMemAllocationFunc, Ctx, false, Arg, + getMemAllocationFunc, Ctx, Arg, NextTrueIndex); }; // Copy args for modification @@ -3066,7 +3066,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { // submitted to report exception origin properly. 
copySubmissionCodeLocation(); - getThreadPool().submit( + queue_impl::getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem))); MShouldCompleteEventIfPossible = false; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index ea2ba3ea72118..628ccdf2593da 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -33,7 +33,6 @@ class node_impl; namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION -bool CurrentCodeLocationValid(); void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr); @@ -793,7 +792,7 @@ void SetArgBasedOnType( const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex); void applyFuncOnFilteredArgs( From eebc51933df59666baad0bb50100cb02dce4e485 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 09:34:20 -0700 Subject: [PATCH 19/58] almost almost buildable: enable build 4 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 2 +- sycl/source/detail/scheduler/commands.cpp | 3 ++- sycl/source/handler.cpp | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 3bd7b6ea7ec0a..1315d32ecaa4f 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -844,7 +844,7 @@ class queue_impl { "function objects should use the sycl::handler API instead."); } - handler Handler(Self, PrimaryQueue, SecondaryQueue); + handler Handler(Self, PrimaryQueue, SecondaryQueue, false); Handler.saveCodeLoc(Loc); PreventSubmit = true; try { diff --git a/sycl/source/detail/scheduler/commands.cpp 
b/sycl/source/detail/scheduler/commands.cpp index d0a790ed97059..1683b874fba5d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -402,8 +402,9 @@ class DispatchHostTask { try { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { + assert(HostTask.MQueue && "Submitted queue for host task must be device queue"); interop_handle IH{MReqToMem, HostTask.MQueue, - // HostTask.MQueue->getDeviceImplPtr(), + HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index c0e0438d9cd2f..015d690d67e7d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -80,12 +80,12 @@ void *getValueFromDynamicParameter( } // namespace detail -handler::handler(std::shared_ptr Queue) - : handler(Queue, Queue, nullptr) {} +handler::handler(std::shared_ptr Queue, bool) + : handler(Queue, Queue, nullptr, false) {} handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue) + std::shared_ptr SecondaryQueue, bool) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue))), MQueue(std::move(Queue)) {} From c6fe5c8098daadcde4dd19241be937e146bf9a17 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 6 Jun 2024 10:12:13 -0700 Subject: [PATCH 20/58] buildable Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 7 ------- sycl/source/detail/device_impl.hpp | 5 ----- sycl/source/detail/stream_impl.cpp | 14 +++++--------- sycl/source/detail/stream_impl.hpp | 4 ---- 4 files changed, 5 insertions(+), 25 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index c677b9165d71f..ae3b04486d1ea 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -716,13 +716,6 @@ bool 
device_impl::has(aspect Aspect) const { PI_ERROR_INVALID_DEVICE); } -std::shared_ptr device_impl::getHostDeviceImpl() { - static std::shared_ptr HostImpl = - std::make_shared(); - - return HostImpl; -} - bool device_impl::isAssertFailSupported() const { return MIsAssertFailSupported; } diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index efec017d372f5..9249bbba59fe8 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -217,11 +217,6 @@ class device_impl { /// \return true if the SYCL device has the given feature. bool has(aspect Aspect) const; - /// Gets the single instance of the Host Device - /// - /// \return the host device_impl singleton - static std::shared_ptr getHostDeviceImpl(); - bool isAssertFailSupported() const; bool isRootDevice() const { return MRootDevice == nullptr; } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 4550b5cc26629..7268293433e82 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -94,12 +94,12 @@ void stream_impl::initStreamHost(QueueImplPtr Queue) { } void stream_impl::flush(const EventImplPtr &LeadEvent) { + assert(LeadEvent && "LeadEvent is expected to be not nullptr"); // We don't want stream flushing to be blocking operation that is why submit a // host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. 
- auto Q = detail::createSyclObjFromImpl( - sycl::detail::Scheduler::getInstance().getDefaultHostQueue()); - event Event = Q.submit([&](handler &cgh) { + auto Q = LeadEvent->getSubmittedQueue(); + event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { auto BufHostAcc = Buf_.get_access( cgh, range<1>(BufferSize_), id<1>(OffsetSize)); @@ -131,14 +131,10 @@ void stream_impl::flush(const EventImplPtr &LeadEvent) { fflush(stdout); }); }); - if (LeadEvent) { - LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - LeadEvent->getSubmittedQueue()->registerStreamServiceEvent( - detail::getSyclObjImpl(Event)); - } + LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); + Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); } -void stream_impl::flush() { flush(nullptr); } } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index 823653016c162..cd3d503b4b894 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -49,10 +49,6 @@ class __SYCL_EXPORT stream_impl { // LeadEvent as well as in queue LeadEvent associated with. 
void flush(const EventImplPtr &LeadEvent); - // Enqueue task to copy stream buffer to the host and print the contents - // Remove during next ABI breaking window - void flush(); - size_t size() const noexcept; size_t get_work_item_buffer_size() const; From 24669e2a82d3765cc08800d4e8691e0c2bc5b28b Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 6 Jun 2024 10:52:53 -0700 Subject: [PATCH 21/58] RT-buildable: enabling UT build Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/AllocaLinking.cpp | 13 +++---------- .../scheduler/CommandsWaitForEvents.cpp | 10 ++-------- .../scheduler/EnqueueWithDependsOnDeps.cpp | 3 +-- sycl/unittests/scheduler/GraphCleanup.cpp | 11 +++-------- sycl/unittests/scheduler/InOrderQueueDeps.cpp | 11 +++-------- sycl/unittests/scheduler/LeafLimit.cpp | 2 -- .../scheduler/LeafLimitDiffContexts.cpp | 2 +- sycl/unittests/scheduler/LeavesCollection.cpp | 9 ++++----- .../scheduler/LinkedAllocaDependencies.cpp | 14 ++++---------- .../scheduler/NoHostUnifiedMemory.cpp | 19 +++++++------------ sycl/unittests/scheduler/QueueFlushing.cpp | 10 +++------- .../scheduler/SchedulerTestUtils.hpp | 3 +-- .../scheduler/StreamInitDependencyOnHost.cpp | 9 +++------ 13 files changed, 35 insertions(+), 81 deletions(-) diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp index a77995a203da3..e15cf24761ee1 100644 --- a/sycl/unittests/scheduler/AllocaLinking.cpp +++ b/sycl/unittests/scheduler/AllocaLinking.cpp @@ -47,13 +47,6 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device, TEST_F(SchedulerTest, AllocaLinking) { HostUnifiedMemory = false; - // This host device constructor should be placed before Mock.redefine - // because it overrides the real implementation of get_device_info - // which is needed when creating a host device. 
- device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; sycl::unittest::PiMock Mock; sycl::queue Q{Mock.getPlatform().get_devices()[0]}; @@ -73,7 +66,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_FALSE(HostAllocaCmd->MLinkedAllocaCmd); EXPECT_FALSE(NonHostAllocaCmd->MLinkedAllocaCmd); @@ -90,7 +83,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); @@ -107,7 +100,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index d893c33f5cc26..499a45d0fe70f 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -219,13 +219,7 @@ TEST_F(SchedulerTest, CommandsWaitForEvents) { 
std::shared_ptr E2( new detail::event_impl(TestContext->EventCtx2, Q2.get_context())); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - - MockCommand Cmd(DefaultHostQueue); + MockCommand Cmd(nullptr); std::vector> Events; Events.push_back(E1); @@ -233,7 +227,7 @@ TEST_F(SchedulerTest, CommandsWaitForEvents) { pi_event EventResult = nullptr; - Cmd.waitForEventsCall(DefaultHostQueue, Events, EventResult); + Cmd.waitForEventsCall(nullptr, Events, EventResult); ASSERT_TRUE(TestContext->EventCtx1WasWaited && TestContext->EventCtx2WasWaited) diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index fc816d1a4f3af..bd7531c964716 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -83,7 +83,7 @@ class DependsOnTests : public ::testing::Test { detail::Command *NewCmd = MS.addCG( std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? MS.getDefaultHostQueue() : QueueDevImpl, + Type == TestCGType::HOST_TASK ? 
nullptr : QueueDevImpl, ToEnqueue); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; @@ -167,7 +167,6 @@ class DependsOnTests : public ::testing::Test { TEST_F(DependsOnTests, EnqueueNoMemObjTwoHostTasks) { // Checks enqueue of two dependent host tasks - detail::QueueImplPtr QueueHostImpl = MS.getDefaultHostQueue(); std::vector Events; detail::Command *Cmd1 = diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 3389769569e5e..e0ec582db065c 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -172,7 +172,7 @@ static void checkCleanupOnEnqueue(MockScheduler &MS, } static void checkCleanupOnLeafUpdate( - MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer &Buf, + MockScheduler &MS, detail::QueueImplPtr QueueImpl, buffer &Buf, detail::Requirement &MockReq, std::function SchedulerCall) { bool CommandDeleted = false; @@ -247,15 +247,10 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { checkCleanupOnLeafUpdate( MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { detail::Command *Leaf = *Record->MWriteLeaves.begin(); - MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl, - detail::Command::BlockReason::HostTask, ToEnqueue); + MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, ToEnqueue); }); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; checkCleanupOnLeafUpdate( - MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) { + MS, nullptr, Buf, MockReq, [&](detail::MemObjRecord *Record) { MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue); }); // Check cleanup on exceeding leaf limit. 
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index 337ef2ef3d403..c19b494f9c484 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -77,11 +77,6 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { sycl::detail::QueueImplPtr InOrderQueueImpl = detail::getSyclObjImpl(InOrderQueue); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; int val; @@ -92,18 +87,18 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req, AuxCmds); MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Check that sequential memory movements submitted to the same in-order // queue do not depend on each other. 
detail::Command *Cmd = - MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); detail::EnqueueResultT Res; auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); Cmd = MS.insertMemoryMove(Record, &Req, InOrderQueueImpl, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); - Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); } diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index 36d8f459a324a..f3417b297bc31 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -36,8 +36,6 @@ TEST_F(SchedulerTest, LeafLimit) { unittest::ScopedEnvVar DisabledCleanup{ DisableCleanupName, "1", detail::SYCLConfig::reset}; - sycl::queue HQueue(detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl())); MockScheduler MS; std::vector> LeavesToAdd; std::unique_ptr MockDepCmd; diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp index 38d9ac784c09f..1af882a423af8 100644 --- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp +++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp @@ -61,7 +61,7 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) { AllocaCmd = MS.getOrCreateAllocaForReq( Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue); std::ignore = MS.getOrCreateAllocaForReq( - Rec, &MockReq, MS.getDefaultHostQueue(), ToEnqueue); + Rec, &MockReq, nullptr, ToEnqueue); DepCmd = std::make_unique(detail::getSyclObjImpl(Queue), MockReq); } diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index ea883041add66..39146ffaa95e8 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ 
b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -37,9 +37,8 @@ createGenericCommand(const std::shared_ptr &Q) { } std::shared_ptr -createEmptyCommand(const std::shared_ptr &Q, - const Requirement &Req) { - EmptyCommand *Cmd = new EmptyCommand(Q); +createEmptyCommand(const Requirement &Req) { + EmptyCommand *Cmd = new EmptyCommand(); Cmd->addRequirement(/* DepCmd = */ nullptr, /* AllocaCmd = */ nullptr, &Req); Cmd->MBlockReason = Command::BlockReason::HostAccessor; return std::shared_ptr{Cmd}; @@ -97,7 +96,7 @@ TEST_F(LeavesCollectionTest, PushBack) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); LE.push_back(Cmds.back().get(), ToEnqueue); @@ -137,7 +136,7 @@ TEST_F(LeavesCollectionTest, Remove) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); if (LE.push_back(Cmds.back().get(), ToEnqueue)) diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 5ab9cfbb43f5a..6ae6b9bfc2344 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -64,28 +64,22 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { sycl::queue Queue1{Dev}; sycl::detail::QueueImplPtr Q1 = sycl::detail::getSyclObjImpl(Queue1); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - auto AllocaDep = [](sycl::detail::Command *, sycl::detail::Command *, sycl::detail::MemObjRecord *, std::vector &) {}; std::shared_ptr Record{ 
- new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(), 10, + new sycl::detail::MemObjRecord(nullptr, 10, AllocaDep)}; MemObjMock MemObj(Record); Req.MSYCLMemObj = &MemObj; - sycl::detail::AllocaCommand AllocaCmd1(DefaultHostQueue, Req, false); + sycl::detail::AllocaCommand AllocaCmd1(nullptr, Req, false); Record->MAllocaCommands.push_back(&AllocaCmd1); - MockCommand DepCmd(DefaultHostQueue, Req); - MockCommand DepDepCmd(DefaultHostQueue, Req); + MockCommand DepCmd(nullptr, Req); + MockCommand DepDepCmd(nullptr, Req); DepCmd.MDeps.push_back({&DepDepCmd, DepDepCmd.getRequirement(), &AllocaCmd1}); DepDepCmd.MUsers.insert(&DepCmd); std::vector ToEnqueue; diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp index 635a8e9c3389c..20cf879d53daf 100644 --- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp +++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp @@ -91,11 +91,6 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { redefinedMemCreateWithNativeHandle); sycl::detail::QueueImplPtr QImpl = detail::getSyclObjImpl(Q); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; // Check non-host alloca with non-discard access mode { @@ -113,10 +108,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // order to perform a memory move. 
EXPECT_EQ(Record->MAllocaCommands.size(), 2U); detail::AllocaCommandBase *HostAllocaCmd = Record->MAllocaCommands[0]; - EXPECT_TRUE(HostAllocaCmd->getQueue()->is_host()); + EXPECT_TRUE(HostAllocaCmd->getQueue() == nullptr); EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd); EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd); - EXPECT_TRUE(Record->MCurContext->is_host()); + EXPECT_TRUE(Record->MCurContext == nullptr); detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl, AuxCmds); @@ -162,9 +157,9 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // another and the transfer is done via a write operation. std::vector AuxCmds; detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req, AuxCmds); + MS.getOrInsertMemObjRecord(nullptr, &Req, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(Record->MAllocaCommands.size(), 1U); detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); @@ -190,14 +185,14 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds); MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Memory movement operations should be omitted for discard access modes. detail::Command *MemoryMove = - MS.insertMemoryMove(Record, &DiscardReq, DefaultHostQueue, AuxCmds); + MS.insertMemoryMove(Record, &DiscardReq, nullptr, AuxCmds); EXPECT_TRUE(MemoryMove == nullptr); // The current context for the record should still be modified. - EXPECT_EQ(Record->MCurContext, DefaultHostQueue->getContextImplPtr()); + EXPECT_EQ(Record->MCurContext, nullptr); } // Check that interoperability memory objects are initialized. 
{ diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index c97428b9d55c6..330ff7e0f02d2 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -122,21 +122,17 @@ TEST_F(SchedulerTest, QueueFlushing) { QueueImplA}; testCommandEnqueue(&UnmapCmd, QueueImplB, MockReq); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; detail::AllocaCommand HostAllocaCmd = - detail::AllocaCommand(DefaultHostQueue, MockReq); + detail::AllocaCommand(nullptr, MockReq); detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, MockReq, &HostAllocaCmd, - QueueImplA, DefaultHostQueue}; + QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, &MockHostPtr, - QueueImplA, DefaultHostQueue}; + QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 88ced1f25904a..20f82f9165c01 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -189,10 +189,9 @@ class MockScheduler : public sycl::detail::Scheduler { sycl::detail::EmptyCommand * addEmptyCmd(sycl::detail::Command *Cmd, const std::vector &Reqs, - const sycl::detail::QueueImplPtr &Queue, sycl::detail::Command::BlockReason Reason, std::vector &ToEnqueue) { - return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue); + return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Reason, ToEnqueue); } sycl::detail::Command * diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 18c0b3e1a8070..838b60809472c 100644 --- 
a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -80,12 +80,9 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { unittest::ScopedEnvVar DisabledCleanup{ DisableCleanupName, "1", detail::SYCLConfig::reset}; - std::shared_ptr HQueueImpl(new detail::queue_impl( - detail::device_impl::getHostDeviceImpl(), /*AsyncHandler=*/{}, - /*PropList=*/{})); // Emulating processing of command group function - MockHandlerStreamInit MockCGH(HQueueImpl, true); + MockHandlerStreamInit MockCGH(nullptr, true); MockCGH.setType(detail::CG::Kernel); auto EmptyKernel = [](sycl::nd_item<1>) {}; @@ -114,11 +111,11 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { static_cast(MainCG.get())->getStreams(); ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - Streams[0]->initStreamHost(HQueueImpl); + Streams[0]->initStreamHost(nullptr); MockScheduler MS; std::vector AuxCmds; - detail::Command *NewCmd = MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds); + detail::Command *NewCmd = MS.addCG(std::move(MainCG), nullptr, AuxCmds); ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; ASSERT_GT(NewCmd->MDeps.size(), 0u) << "No deps appeared in the new exec kernel command"; From fcc7748699821b8a53db059de50b94dff5f96232 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 03:42:25 -0700 Subject: [PATCH 22/58] RT-buildable: restore incorrectly deleted code Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 28 ++++++++++++++-- sycl/source/detail/memory_manager.hpp | 3 ++ sycl/source/detail/scheduler/commands.cpp | 41 +++++++++++++++++++---- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index e2c22f794f587..461cf8b85915c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -266,6 +266,11 @@ void 
MemoryManager::releaseMemObj(ContextImplPtr TargetContext, return; } + if (!TargetContext) { + MemObj->releaseHostMem(MemAllocation); + return; + } + const PluginPtr &Plugin = TargetContext->getPlugin(); memReleaseHelper(Plugin, pi::cast(MemAllocation)); } @@ -283,6 +288,19 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, OutEvent); } +void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const sycl::property_list &) { + std::ignore = HostPtrReadOnly; + std::ignore = Size; + + // Can return user pointer directly if it is not a nullptr. + if (UserPtr) + return UserPtr; + + return MemObj->allocateHostMem(); +} + void *MemoryManager::allocateInteropMemObject( ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, @@ -379,9 +397,10 @@ void *MemoryManager::allocateMemBuffer( const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; - if (UserPtr && InteropContext) - MemPtr = - allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, + if (!TargetContext) + MemPtr = allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); + else if (UserPtr && InteropContext) + MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, @@ -398,6 +417,9 @@ void *MemoryManager::allocateMemImage( const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { + if (!TargetContext) + return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, + PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); diff --git 
a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 7be17898bc0d9..deefda9ccd8ff 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -85,6 +85,9 @@ class __SYCL_EXPORT MemoryManager { static void releaseMemObj(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, void *UserPtr); + static void *allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const sycl::property_list &PropsList); static void * allocateInteropMemObject(ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1683b874fba5d..b1713473f2de3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -459,8 +459,38 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { - assert(Queue && "Device queue is expected here"); if (!EventImpls.empty()) { + if (!Queue) { + // Host queue can wait for events from different contexts, i.e. it may + // contain events with different contexts in its MPreparedDepsEvents. + // OpenCL 2.1 spec says that clWaitForEvents will return + // CL_INVALID_CONTEXT if events specified in the list do not belong to + // the same context. Thus we split all the events into per-context map. + // An example. We have two queues for the same CPU device: Q1, Q2. Thus + // we will have two different contexts for the same CPU device: C1, C2. + // Also we have default host queue. This queue is accessible via + // Scheduler. Now, let's assume we have three different events: E1(C1), + // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all + // three events (E1, E2, E3). 
Now, if piEventsWait is called for all + // three events we'll experience failure with CL_INVALID_CONTEXT 'cause + // these events refer to different contexts. + std::map> + RequiredEventsPerContext; + + for (const EventImplPtr &Event : EventImpls) { + ContextImplPtr Context = Event->getContextImpl(); + assert(Context.get() && + "Only non-host events are expected to be waited for here"); + RequiredEventsPerContext[Context.get()].push_back(Event); + } + + for (auto &CtxWithEvents : RequiredEventsPerContext) { + std::vector RawEvents = + getPiEvents(CtxWithEvents.second); + CtxWithEvents.first->getPlugin()->call( + RawEvents.size(), RawEvents.data()); + } + } else { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) assert(!Event->isHost() && @@ -477,6 +507,7 @@ void Command::waitForEvents(QueueImplPtr Queue, Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); } + } } /// It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents @@ -700,13 +731,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext == WorkerContext) - MPreparedDepsEvents.push_back(std::move(DepEvent)); - else - { + if (DepEventContext != WorkerContext && WorkerContext){ Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); - } + } else + MPreparedDepsEvents.push_back(std::move(DepEvent)); return ConnectionCmd; } From 7aa76d9f1e51eb430909125e9c4acc54518c7e81 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 05:59:28 -0700 Subject: [PATCH 23/58] RT buildable: check-sycl-AccessorTests passed Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/event_impl.hpp | 4 ++-- sycl/source/detail/scheduler/commands.cpp | 17 +++++++++-------- 
sycl/source/detail/sycl_mem_obj_t.cpp | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e34597aa008d1..e38c15e04879a 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -566,7 +566,7 @@ void event_impl::setCommand(void *Cmd) { MCommand = Cmd; auto TypedCommand = static_cast(Cmd); if (TypedCommand) - MIsHostTask = TypedCommand->isHostTask(); + MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; } } // namespace detail diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 7c1eb99e3b286..237939ea37bd8 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -337,7 +337,7 @@ class event_impl { void setEnqueued() { MIsEnqueued = true; } - bool isHost() { return MIsHostTask; } + bool isHost() { return MIsHostEvent; } protected: // When instrumentation is enabled emits trace event for event wait begin and @@ -406,7 +406,7 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; - bool MIsHostTask{false}; + bool MIsHostEvent{false}; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b1713473f2de3..f7b9805ff17ec 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -459,6 +459,11 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { + #ifndef NDEBUG + for (const EventImplPtr &Event : EventImpls) + assert(!Event->isHost() && + "Only non-host events are expected to be waited for here"); +#endif if (!EventImpls.empty()) { if (!Queue) { // Host queue can wait for events from different contexts, i.e. 
it may @@ -491,12 +496,6 @@ void Command::waitForEvents(QueueImplPtr Queue, RawEvents.size(), RawEvents.data()); } } else { -#ifndef NDEBUG - for (const EventImplPtr &Event : EventImpls) - assert(!Event->isHost() && - "Only non-host events are expected to be waited for here"); -#endif - std::vector RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, MWorkerQueue); @@ -1488,7 +1487,8 @@ void MemCpyCommand::emitInstrumentationData() { } ContextImplPtr MemCpyCommand::getWorkerContext() const { - assert(MWorkerQueue && "Worker queue for mem cpy command must be not nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } @@ -1661,7 +1661,8 @@ void MemCpyCommandHost::emitInstrumentationData() { } ContextImplPtr MemCpyCommandHost::getWorkerContext() const { - assert(MWorkerQueue && "Worker queue for mem cpy host command must be not nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 87f005fe8ca78..a95b9b43d7f5c 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -209,7 +209,7 @@ void SYCLMemObjT::detachMemoryObject( !MOwnNativeHandle || (MInteropContext && !MInteropContext->isOwnedByRuntime()); - if (MRecord && MRecord->MCurContext->isOwnedByRuntime() && + if (MRecord && MRecord->MCurContext && MRecord->MCurContext->isOwnedByRuntime() && !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) Scheduler::getInstance().deferMemObjRelease(Self); } From dc4a94ea111456a188ec60eaeef7ff9a053bf3bd Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 06:28:04 -0700 Subject: [PATCH 24/58] RT-buildable: enable unittests 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 3 ++- sycl/source/detail/scheduler/scheduler.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e38c15e04879a..8f676a97f187d 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -81,7 +81,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (!MEvent) { + if (MIsHostEvent || !MEvent) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -126,6 +126,7 @@ const PluginPtr &event_impl::getPlugin() { void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { + MIsHostEvent = Context == nullptr; MContext = Context; MIsContextInitialized = true; } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 7e5db05daf01a..d3fe7b523e689 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -459,7 +459,8 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { std::vector ToCleanUp; auto CmdEvent = Cmd->getEvent(); - auto QueueImpl = Cmd->getQueue(); + auto QueueImpl = CmdEvent->getSubmittedQueue(); + assert(QueueImpl && "Submitted queue for host task must not be null"); { ReadLockT Lock = acquireReadLock(); From 8c57888b2a5a733d248322287e599d0f08855444 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 08:52:24 -0700 Subject: [PATCH 25/58] RT-buildable: unittests enabling 3 Signed-off-by: Tikhomirova, Kseniya --- .../source/detail/scheduler/graph_builder.cpp | 2 +- sycl/source/detail/stream_impl.cpp | 70 +++++++++---------- .../scheduler/StreamInitDependencyOnHost.cpp | 10 ++- 3 files changed, 44 insertions(+), 38 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 8778ad6927c3e..6d3fbdd157618 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1342,7 +1342,7 @@ Command 
*Scheduler::GraphBuilder::connectDepEvent( CG::CodeplayHostTask, /* Payload */ {})); ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Cmd->getQueue()); + std::move(ConnectCG), nullptr); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7268293433e82..cb46510551a30 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -98,41 +98,41 @@ void stream_impl::flush(const EventImplPtr &LeadEvent) { // We don't want stream flushing to be blocking operation that is why submit a // host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. - auto Q = LeadEvent->getSubmittedQueue(); - event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { - auto BufHostAcc = - Buf_.get_access( - cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - // Create accessor to the flush buffer even if not using it yet. Otherwise - // kernel will be a leaf for the flush buffer and scheduler will not be able - // to cleanup the kernel. TODO: get rid of finalize method by using host - // accessor to the flush buffer. - auto FlushBufHostAcc = - FlushBuf_ - .get_access( - cgh); - cgh.host_task([=] { - if (!BufHostAcc.empty()) { - // SYCL 2020, 4.16: - // > If the totalBufferSize or workItemBufferSize limits are exceeded, - // > it is implementation-defined whether the streamed characters - // > exceeding the limit are output, or silently ignored/discarded, and - // > if output it is implementation-defined whether those extra - // > characters exceeding the workItemBufferSize limit count toward the - // > totalBufferSize limit. Regardless of this implementation defined - // > behavior of output exceeding the limits, no undefined or erroneous - // > behavior is permitted of an implementation when the limits are - // > exceeded. 
- // - // Defend against zero-sized buffers (although they'd have no practical - // use). - printf("%s", &(BufHostAcc[0])); - } - fflush(stdout); - }); - }); - LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); + // auto Q = LeadEvent->getSubmittedQueue(); + // event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { + // auto BufHostAcc = + // Buf_.get_access( + // cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + // // Create accessor to the flush buffer even if not using it yet. Otherwise + // // kernel will be a leaf for the flush buffer and scheduler will not be able + // // to cleanup the kernel. TODO: get rid of finalize method by using host + // // accessor to the flush buffer. + // auto FlushBufHostAcc = + // FlushBuf_ + // .get_access( + // cgh); + // cgh.host_task([=] { + // if (!BufHostAcc.empty()) { + // // SYCL 2020, 4.16: + // // > If the totalBufferSize or workItemBufferSize limits are exceeded, + // // > it is implementation-defined whether the streamed characters + // // > exceeding the limit are output, or silently ignored/discarded, and + // // > if output it is implementation-defined whether those extra + // // > characters exceeding the workItemBufferSize limit count toward the + // // > totalBufferSize limit. Regardless of this implementation defined + // // > behavior of output exceeding the limits, no undefined or erroneous + // // > behavior is permitted of an implementation when the limits are + // // > exceeded. + // // + // // Defend against zero-sized buffers (although they'd have no practical + // // use). 
+ // printf("%s", &(BufHostAcc[0])); + // } + // fflush(stdout); + // }); + // }); + // LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); + // Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); } } // namespace detail diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 838b60809472c..4b34a1f4d6828 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -12,6 +12,7 @@ #include #include #include +#include using namespace sycl; @@ -81,8 +82,13 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { DisableCleanupName, "1", detail::SYCLConfig::reset}; + sycl::unittest::PiMock Mock; + sycl::platform Plt = Mock.getPlatform(); + sycl::queue Q(Plt.get_devices()[0]); + std::shared_ptr QImpl = detail::getSyclObjImpl(Q); + // Emulating processing of command group function - MockHandlerStreamInit MockCGH(nullptr, true); + MockHandlerStreamInit MockCGH(QImpl, true); MockCGH.setType(detail::CG::Kernel); auto EmptyKernel = [](sycl::nd_item<1>) {}; @@ -111,7 +117,7 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { static_cast(MainCG.get())->getStreams(); ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - Streams[0]->initStreamHost(nullptr); + Streams[0]->initStreamHost(QImpl); MockScheduler MS; std::vector AuxCmds; From abfc5bfbdf48b8bfe48cfb17e68d9a91bb64ba9e Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 17 Jun 2024 07:49:32 -0700 Subject: [PATCH 26/58] tiny cleanup Signed-off-by: Tikhomirova, Kseniya --- .../source/detail/scheduler/graph_builder.cpp | 22 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 3 +-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 6d3fbdd157618..1932f18d697ac 100644 --- 
a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -868,7 +868,7 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, Command::BlockReason Reason, - std::vector &ToEnqueue, const bool AddDepsToLeaves) { + std::vector &ToEnqueue) { EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) @@ -889,19 +889,17 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( if (!Reqs.size()) Cmd->addUser(EmptyCmd); - if (AddDepsToLeaves) { - const std::vector &Deps = Cmd->MDeps; - std::vector ToCleanUp; - for (const DepDesc &Dep : Deps) { - const Requirement *Req = Dep.MDepRequirement; - MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + const std::vector &Deps = Cmd->MDeps; + std::vector ToCleanUp; + for (const DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); - addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); - } - for (Command *Cmd : ToCleanUp) - cleanupCommand(Cmd); + updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); + addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); } + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return EmptyCmd; } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d3462872c9ddf..4e0bf465d59fd 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -742,8 +742,7 @@ class Scheduler { EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, Command::BlockReason Reason, - std::vector &ToEnqueue, - const bool AddDepsToLeaves = true); + std::vector &ToEnqueue); void createGraphForCommand(Command *NewCmd, CG &CG, bool isInteropTask, std::vector &Reqs, From 
75f6eab8dd7a8f5b008d1b955bad3c3fc36914ba Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 17 Jun 2024 07:21:30 -0700 Subject: [PATCH 27/58] move stream_impl flush Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 19 ++++- sycl/source/detail/queue_impl.hpp | 3 +- sycl/source/detail/scheduler/scheduler.cpp | 11 --- sycl/source/detail/stream_impl.cpp | 83 ++++++------------- sycl/source/detail/stream_impl.hpp | 10 +-- .../scheduler/CommandsWaitForEvents.cpp | 2 +- .../scheduler/StreamInitDependencyOnHost.cpp | 62 -------------- 7 files changed, 49 insertions(+), 141 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 298d4078cc922..af7af19ede120 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -361,8 +361,10 @@ event queue_impl::submit_impl(const std::function &CGF, // Host and interop tasks, however, are not submitted to low-level runtimes // and require separate dependency management. const CG::CGTYPE Type = Handler.getType(); - event Event = detail::createSyclObjFromImpl( - std::make_shared()); + event Event = detail::createSyclObjFromImpl(std::make_shared()); + std::vector Streams; + if (Type == CG::Kernel) + Streams = std::move(Handler.MStreamStorage); if (PostProcess) { bool IsKernel = Type == CG::Kernel; @@ -380,6 +382,19 @@ event queue_impl::submit_impl(const std::function &CGF, finalizeHandler(Handler, Event); addEvent(Event); + + auto EventImpl = detail::getSyclObjImpl(Event); + for (auto &Stream : Streams) { + // We don't want stream flushing to be blocking operation that is why submit a + // host task to print stream buffer. It will fire up as soon as the kernel + // finishes execution. 
+ event FlushEvent = submit_impl([&](handler &ServiceCGH) { + Stream->generateFlushCommand(ServiceCGH); + }, Self, PrimaryQueue, SecondaryQueue, Loc, {}); + EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent)); + registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent)); + } + return Event; } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c3d0c4c5752f8..e72ded829a798 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -13,10 +13,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -26,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index d3fe7b523e689..52eb59b225004 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -99,13 +99,6 @@ EventImplPtr Scheduler::addCG( EventImplPtr NewEvent = nullptr; const CG::CGTYPE Type = CommandGroup->getType(); std::vector AuxiliaryCmds; - std::vector Streams; - - if (Type == CG::Kernel) { - auto *CGExecKernelPtr = static_cast(CommandGroup.get()); - Streams = CGExecKernelPtr->getStreams(); - CGExecKernelPtr->clearStreams(); - } std::vector> AuxiliaryResources; AuxiliaryResources = CommandGroup->getAuxiliaryResources(); CommandGroup->clearAuxiliaryResources(); @@ -143,10 +136,6 @@ EventImplPtr Scheduler::addCG( if (ShouldEnqueue) { enqueueCommandForCG(NewEvent, AuxiliaryCmds); - - for (const auto &StreamImplPtr : Streams) { - StreamImplPtr->flush(NewEvent); - } } if (!AuxiliaryResources.empty()) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index cb46510551a30..7d926fbdb83dd 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -76,65 +76,36 @@ size_t stream_impl::get_size() const { return BufferSize_; } size_t 
stream_impl::get_max_statement_size() const { return MaxStatementSize_; } -void stream_impl::initStreamHost(QueueImplPtr Queue) { - // Real size of full flush buffer is saved only in buffer_impl field of - // FlushBuf object. - size_t FlushBufSize = getSyclObjImpl(FlushBuf_)->size(); - - auto Q = createSyclObjFromImpl(Queue); - Q.submit([&](handler &cgh) { - auto FlushBufAcc = FlushBuf_.get_access( - cgh, range<1>(1), id<1>(0)); - cgh.host_task([=] { - char *FlushBufPtr = FlushBufAcc.get_pointer(); - std::memset(FlushBufPtr, 0, FlushBufSize); - }); +void stream_impl::generateFlushCommand(handler& cgh) +{ + // Create accessor to the flush buffer even if not using it yet. Otherwise + // kernel will be a leaf for the flush buffer and scheduler will not be able + // to cleanup the kernel. TODO: get rid of finalize method by using host + // accessor to the flush buffer. + host_accessor FlushBuffHostAcc(FlushBuf_, cgh); + host_accessor BufHostAcc (Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + + cgh.host_task([=] { + if (!BufHostAcc.empty()) { + // SYCL 2020, 4.16: + // > If the totalBufferSize or workItemBufferSize limits are exceeded, + // > it is implementation-defined whether the streamed characters + // > exceeding the limit are output, or silently ignored/discarded, and + // > if output it is implementation-defined whether those extra + // > characters exceeding the workItemBufferSize limit count toward the + // > totalBufferSize limit. Regardless of this implementation defined + // > behavior of output exceeding the limits, no undefined or erroneous + // > behavior is permitted of an implementation when the limits are + // > exceeded. + // + // Defend against zero-sized buffers (although they'd have no practical + // use). 
+ printf("%s", &(BufHostAcc[0])); + } + fflush(stdout); }); } -void stream_impl::flush(const EventImplPtr &LeadEvent) { - assert(LeadEvent && "LeadEvent is expected to be not nullptr"); - // We don't want stream flushing to be blocking operation that is why submit a - // host task to print stream buffer. It will fire up as soon as the kernel - // finishes execution. - // auto Q = LeadEvent->getSubmittedQueue(); - // event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { - // auto BufHostAcc = - // Buf_.get_access( - // cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - // // Create accessor to the flush buffer even if not using it yet. Otherwise - // // kernel will be a leaf for the flush buffer and scheduler will not be able - // // to cleanup the kernel. TODO: get rid of finalize method by using host - // // accessor to the flush buffer. - // auto FlushBufHostAcc = - // FlushBuf_ - // .get_access( - // cgh); - // cgh.host_task([=] { - // if (!BufHostAcc.empty()) { - // // SYCL 2020, 4.16: - // // > If the totalBufferSize or workItemBufferSize limits are exceeded, - // // > it is implementation-defined whether the streamed characters - // // > exceeding the limit are output, or silently ignored/discarded, and - // // > if output it is implementation-defined whether those extra - // // > characters exceeding the workItemBufferSize limit count toward the - // // > totalBufferSize limit. Regardless of this implementation defined - // // > behavior of output exceeding the limits, no undefined or erroneous - // // > behavior is permitted of an implementation when the limits are - // // > exceeded. - // // - // // Defend against zero-sized buffers (although they'd have no practical - // // use). 
- // printf("%s", &(BufHostAcc[0])); - // } - // fflush(stdout); - // }); - // }); - // LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - // Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); -} - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index cd3d503b4b894..aacb495537943 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -41,14 +41,6 @@ class __SYCL_EXPORT stream_impl { // buffer and offset in the flush buffer GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); - // Initialize flush buffers on host. - void initStreamHost(QueueImplPtr Queue); - - // Enqueue task to copy stream buffer to the host and print the contents - // The host task event is then registered for post processing in the - // LeadEvent as well as in queue LeadEvent associated with. - void flush(const EventImplPtr &LeadEvent); - size_t size() const noexcept; size_t get_work_item_buffer_size() const; @@ -67,6 +59,8 @@ class __SYCL_EXPORT stream_impl { return PropList_.get_property(); } + void generateFlushCommand(handler& cgh); + private: // Size of the stream buffer size_t BufferSize_; diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index 499a45d0fe70f..43aa7a88775d7 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -163,7 +163,7 @@ TEST_F(SchedulerTest, StreamAUXCmdsWait) { auto EventImplProxy = std::static_pointer_cast(EventImpl); - ASSERT_TRUE(EventImplProxy->MPostCompleteEvents.size() == 1) + ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1) << "Expected 1 post complete event"; Q.wait(); diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 4b34a1f4d6828..d1e7f22aa9485 100644 --- 
a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -74,65 +74,3 @@ static bool ValidateDepCommandsTree(const detail::Command *Cmd, return false; } - -TEST_F(SchedulerTest, StreamInitDependencyOnHost) { - // Disable post enqueue cleanup so that it doesn't interfere with dependency - // checks. - unittest::ScopedEnvVar DisabledCleanup{ - DisableCleanupName, "1", - detail::SYCLConfig::reset}; - - sycl::unittest::PiMock Mock; - sycl::platform Plt = Mock.getPlatform(); - sycl::queue Q(Plt.get_devices()[0]); - std::shared_ptr QImpl = detail::getSyclObjImpl(Q); - - // Emulating processing of command group function - MockHandlerStreamInit MockCGH(QImpl, true); - MockCGH.setType(detail::CG::Kernel); - - auto EmptyKernel = [](sycl::nd_item<1>) {}; - MockCGH - .setHostKernel, 1, class Empty>( - EmptyKernel); - MockCGH.setNDRangeDesc( - sycl::nd_range<1>{sycl::range<1>{1}, sycl::range<1>{1}}); - - // Emulating construction of stream object inside command group - detail::StreamImplPtr StreamImpl = - std::make_shared(1024, 200, MockCGH); - detail::GlobalBufAccessorT FlushBufAcc = - StreamImpl->accessGlobalFlushBuf(MockCGH); - MockCGH.addStream(StreamImpl); - - detail::SYCLMemObjI *FlushBufMemObjPtr = - detail::getSyclObjImpl(FlushBufAcc)->MSYCLMemObj; - ASSERT_TRUE(!!FlushBufMemObjPtr) - << "Memory object for stream flush buffer not initialized"; - - std::unique_ptr MainCG = MockCGH.finalize(); - - // Emulate call of Scheduler::addCG - std::vector Streams = - static_cast(MainCG.get())->getStreams(); - ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - - Streams[0]->initStreamHost(QImpl); - - MockScheduler MS; - std::vector AuxCmds; - detail::Command *NewCmd = MS.addCG(std::move(MainCG), nullptr, AuxCmds); - ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; - ASSERT_GT(NewCmd->MDeps.size(), 0u) - << "No deps appeared in the new exec kernel command"; - - // Searching in 
dependencies for CG execution command that initializes flush - // buffer of a stream that is supposed to be used inside NewCmd's CG. - // Tree of dependencies should look like: - // [MAIN_CG] -> [EMPTY_NODE {FlushBufMemObj}] -> [FILL_CG {FlushBufMemObj}] -> - // [[ALLOC_TASK {FlushBufMemObj}] - std::vector DepCmdsTypes({CmdTypeTy::RUN_CG, // FILL_CG - CmdTypeTy::ALLOCA}); - ASSERT_TRUE(ValidateDepCommandsTree(NewCmd, DepCmdsTypes, FlushBufMemObjPtr)) - << "Dependency on stream flush buffer initialization not found"; -} From be12c01ecc837de0ff5f7f3c2f17ca34b03d921d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 04:44:06 -0700 Subject: [PATCH 28/58] test fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 3 ++ sycl/source/detail/event_impl.cpp | 30 +++++++++---------- sycl/source/detail/image_impl.cpp | 2 ++ .../scheduler/CommandsWaitForEvents.cpp | 2 +- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 846972254f7d9..e24b6f6f2510e 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -327,6 +327,9 @@ bool device_impl::has(aspect Aspect) const { size_t return_size = 0; switch (Aspect) { + case aspect::host: + //Deprecated + return false; case aspect::cpu: return is_cpu(); case aspect::gpu: diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 0d2976e7ec271..93dc4b7fca1b1 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -155,15 +155,13 @@ event_impl::event_impl(const QueueImplPtr &Queue) MFallbackProfiling{MIsProfilingEnabled && Queue && Queue->isProfilingFallback()} { if (Queue) this->setContextImpl(Queue->getContextImplPtr()); - if (!Queue) { + else { MState.store(HES_NotComplete); - if (Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw 
sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); - } + MHostProfilingInfo.reset(new HostProfilingInfo()); + if (!MHostProfilingInfo) + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Out of host memory " + + codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); return; } MState.store(HES_Complete); @@ -381,13 +379,15 @@ event_impl::get_info() { if (MState == HES_Discarded) return info::event_command_status::ext_oneapi_unknown; - // Command is enqueued and PiEvent is ready - if (MEvent) - return get_event_info( - this->getHandleRef(), this->getPlugin()); - // Command is blocked and not enqueued, PiEvent is not assigned yet - else if (MCommand) - return sycl::info::event_command_status::submitted; + if (!MIsHostEvent) { + // Command is enqueued and PiEvent is ready + if (MEvent) + return get_event_info( + this->getHandleRef(), this->getPlugin()); + // Command is blocked and not enqueued, PiEvent is not assigned yet + else if (MCommand) + return sycl::info::event_command_status::submitted; + } return MState.load() != HES_Complete ? 
sycl::info::event_command_status::submitted diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp index 0b512ae1aedbe..e5bacd33fc70d 100644 --- a/sycl/source/detail/image_impl.cpp +++ b/sycl/source/detail/image_impl.cpp @@ -471,6 +471,8 @@ bool image_impl::checkImageFormat( } std::vector image_impl::getDevices(const ContextImplPtr Context) { + if (!Context) + return {}; return Context->get_info(); } diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index 43aa7a88775d7..daf8599947ad2 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -163,7 +163,7 @@ TEST_F(SchedulerTest, StreamAUXCmdsWait) { auto EventImplProxy = std::static_pointer_cast(EventImpl); - ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1) + ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1u) << "Expected 1 post complete event"; Q.wait(); From e043ee01f185cecac5c0cbd2648853ac0ff4c6db Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:35:10 -0700 Subject: [PATCH 29/58] restore & update ABI - not breaking Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 9 +++++++++ sycl/source/detail/stream_impl.hpp | 9 +++++++++ sycl/test/abi/sycl_symbols_linux.dump | 17 +++++++++-------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7d926fbdb83dd..75c80745ec71c 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -106,6 +106,15 @@ void stream_impl::generateFlushCommand(handler& cgh) }); } + // ABI break: remove + void stream_impl::initStreamHost(QueueImplPtr ){}; + + // ABI break: remove + void stream_impl::flush(const EventImplPtr &) {}; + + // ABI break: remove + void stream_impl::flush() {}; + } // namespace detail } // namespace _V1 } // namespace 
sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index aacb495537943..4fc1f4b1d5a8a 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -41,6 +41,15 @@ class __SYCL_EXPORT stream_impl { // buffer and offset in the flush buffer GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); + // ABI break: remove + void initStreamHost(QueueImplPtr); + + // ABI break: remove + void flush(const EventImplPtr &); + + // ABI break: remove + void flush(); + size_t size() const noexcept; size_t get_work_item_buffer_size() const; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 0edaaa25b4ba1..c60fdb1318905 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3119,6 +3119,7 @@ _ZN4sycl3_V15queue10mem_adviseEPKvmiRKSt6vectorINS0_5eventESaIS5_EERKNS0_6detail _ZN4sycl3_V15queue10wait_proxyERKNS0_6detail13code_locationE _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEES1_RKNS0_6detail13code_locationE +_ZN4sycl3_V15queue15ext_oneapi_prodEv _ZN4sycl3_V15queue17discard_or_returnERKNS0_5eventE _ZN4sycl3_V15queue18throw_asynchronousEv _ZN4sycl3_V15queue20memcpyToDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE @@ -3230,6 +3231,7 @@ _ZN4sycl3_V16detail11stream_impl14initStreamHostESt10shared_ptrINS1_10queue_impl _ZN4sycl3_V16detail11stream_impl15accessGlobalBufERNS0_7handlerE _ZN4sycl3_V16detail11stream_impl18accessGlobalOffsetERNS0_7handlerE _ZN4sycl3_V16detail11stream_impl20accessGlobalFlushBufERNS0_7handlerE +_ZN4sycl3_V16detail11stream_impl20generateFlushCommandERNS0_7handlerE _ZN4sycl3_V16detail11stream_impl5flushERKSt10shared_ptrINS1_10event_implEE _ZN4sycl3_V16detail11stream_impl5flushEv _ZN4sycl3_V16detail11stream_implC1EmmRKNS0_13property_listE @@ -3621,6 +3623,7 @@ 
_ZN4sycl3_V17handler28memcpyToHostOnlyDeviceGlobalEPKvS3_mbmm _ZN4sycl3_V17handler28setStateExplicitKernelBundleEv _ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE +_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_ _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE _ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE _ZN4sycl3_V17handler6memcpyEPvPKvm @@ -3633,7 +3636,6 @@ _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb -_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_ _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE @@ -3748,7 +3750,6 @@ _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue16enable_profilingEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue4cuda18use_default_streamEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue8in_orderEEEbv _ZNK4sycl3_V15queue16ext_oneapi_emptyEv -_ZN4sycl3_V15queue15ext_oneapi_prodEv _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv 
_ZNK4sycl3_V15queue16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv @@ -3973,6 +3974,12 @@ _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22m _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22max_image_linear_widthEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device23max_image_linear_heightEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device26max_image_linear_row_pitchEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv 
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device15supports_fusionEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device28max_registers_per_work_groupEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device10extensionsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv @@ -4084,12 +4091,6 @@ _ZNK4sycl3_V16device13get_info_implINS0_4info6device7versionEEENS0_6detail11ABIN _ZNK4sycl3_V16device13get_info_implINS0_4info6device8atomic64EEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device8platformEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device9vendor_idEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv 
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13has_extensionERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE _ZNK4sycl3_V16device14is_acceleratorEv _ZNK4sycl3_V16device16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv From cea7c7271f0172ea8b45db2b3b221d4d5cb11937 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:48:29 -0700 Subject: [PATCH 30/58] clang git-clang-format run on changed files Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 4 +- sycl/source/detail/context_impl.cpp | 3 +- sycl/source/detail/device_impl.cpp | 5 +- sycl/source/detail/device_impl.hpp | 12 +- sycl/source/detail/event_impl.cpp | 18 +- sycl/source/detail/memory_manager.cpp | 28 +-- sycl/source/detail/platform_impl.hpp | 8 +- sycl/source/detail/program_impl.cpp | 22 +-- sycl/source/detail/program_impl.hpp | 4 +- sycl/source/detail/queue_impl.cpp | 15 +- sycl/source/detail/queue_impl.hpp | 19 +- sycl/source/detail/scheduler/commands.cpp | 165 +++++++++--------- sycl/source/detail/scheduler/commands.hpp | 12 +- .../source/detail/scheduler/graph_builder.cpp | 59 +++---- sycl/source/detail/scheduler/scheduler.cpp | 8 +- sycl/source/detail/stream_impl.cpp | 21 +-- sycl/source/detail/stream_impl.hpp | 2 +- sycl/source/detail/sycl_mem_obj_t.cpp | 5 +- 
sycl/source/detail/usm/usm_impl.cpp | 48 ++--- .../scheduler/EnqueueWithDependsOnDeps.cpp | 3 +- sycl/unittests/scheduler/GraphCleanup.cpp | 3 +- sycl/unittests/scheduler/InOrderQueueDeps.cpp | 3 +- .../scheduler/LeafLimitDiffContexts.cpp | 4 +- sycl/unittests/scheduler/LeavesCollection.cpp | 3 +- .../scheduler/LinkedAllocaDependencies.cpp | 3 +- .../scheduler/NoHostUnifiedMemory.cpp | 3 +- sycl/unittests/scheduler/QueueFlushing.cpp | 10 +- .../scheduler/StreamInitDependencyOnHost.cpp | 2 +- 28 files changed, 239 insertions(+), 253 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 70b12836fc297..1261096b82047 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -56,13 +56,13 @@ context::context(const std::vector &DeviceList, throw invalid_parameter_error("DeviceList is empty.", PI_ERROR_INVALID_VALUE); } - + const auto &RefPlatform = detail::getSyclObjImpl(DeviceList[0].get_platform())->getHandleRef(); if (std::any_of(DeviceList.begin(), DeviceList.end(), [&](const device &CurrentDevice) { return (detail::getSyclObjImpl(CurrentDevice.get_platform()) - ->getHandleRef() != RefPlatform); + ->getHandleRef() != RefPlatform); })) throw invalid_parameter_error( "Can't add devices across platforms to a single context.", diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 0c79ed2f70462..8ae13b345b250 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -33,8 +33,7 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(1, Device), MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), - MPropList(PropList), - MSupportBufferLocationByDevices(NotChecked) { + MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MKernelProgramCache.setContextPtr(this); } diff --git a/sycl/source/detail/device_impl.cpp 
b/sycl/source/detail/device_impl.cpp index e24b6f6f2510e..ebad36158cfc6 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -34,8 +34,7 @@ device_impl::device_impl(sycl::detail::pi::PiDevice Device, device_impl::device_impl(pi_native_handle InteropDeviceHandle, sycl::detail::pi::PiDevice Device, PlatformImplPtr Platform, const PluginPtr &Plugin) - : MDevice(Device), - MDeviceHostBaseTime(std::make_pair(0, 0)) { + : MDevice(Device), MDeviceHostBaseTime(std::make_pair(0, 0)) { bool InteroperabilityConstructor = false; if (Device == nullptr) { @@ -328,7 +327,7 @@ bool device_impl::has(aspect Aspect) const { switch (Aspect) { case aspect::host: - //Deprecated + // Deprecated return false; case aspect::cpu: return is_cpu(); diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 9249bbba59fe8..a3344ecdd3870 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -64,18 +64,14 @@ class device_impl { /// For host device an exception is thrown /// /// \return non-constant reference to PI device - sycl::detail::pi::PiDevice &getHandleRef() { - return MDevice; - } + sycl::detail::pi::PiDevice &getHandleRef() { return MDevice; } /// Get constant reference to PI device /// /// For host device an exception is thrown /// /// \return constant reference to PI device - const sycl::detail::pi::PiDevice &getHandleRef() const { - return MDevice; - } + const sycl::detail::pi::PiDevice &getHandleRef() const { return MDevice; } /// Check if device is a CPU device /// @@ -90,9 +86,7 @@ class device_impl { /// Check if device is an accelerator device /// /// \return true if SYCL device is an accelerator device - bool is_accelerator() const { - return MType == PI_DEVICE_TYPE_ACC; - } + bool is_accelerator() const { return MType == PI_DEVICE_TYPE_ACC; } /// Return device type /// diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 
93dc4b7fca1b1..7d91129f25b51 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -38,8 +38,8 @@ void event_impl::ensureContextInitialized() { return; const device SyclDevice; - this->setContextImpl(detail::queue_impl::getDefaultOrNew( - detail::getSyclObjImpl(SyclDevice))); + this->setContextImpl( + detail::queue_impl::getDefaultOrNew(detail::getSyclObjImpl(SyclDevice))); } event_impl::~event_impl() { @@ -134,8 +134,8 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), - MIsFlushed(true), MState(HES_Complete) { + MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), + MState(HES_Complete) { sycl::detail::pi::PiContext TempContext; getPlugin()->call( @@ -150,9 +150,9 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, - MFallbackProfiling{MIsProfilingEnabled && Queue && Queue->isProfilingFallback()} { + : MQueue{Queue}, MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, + MFallbackProfiling{MIsProfilingEnabled && Queue && + Queue->isProfilingFallback()} { if (Queue) this->setContextImpl(Queue->getContextImplPtr()); else { @@ -412,7 +412,7 @@ event_impl::get_backend_info() const { } // If the queue has been released, no platform will be associated // so return empty string. 
- return ""; + return ""; } template <> @@ -571,7 +571,7 @@ bool event_impl::isCompleted() { void event_impl::setCommand(void *Cmd) { MCommand = Cmd; - auto TypedCommand = static_cast(Cmd); + auto TypedCommand = static_cast(Cmd); if (TypedCommand) MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 461cf8b85915c..6f30ceef8eb51 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -398,9 +398,11 @@ void *MemoryManager::allocateMemBuffer( sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; if (!TargetContext) - MemPtr = allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); + MemPtr = + allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); else if (UserPtr && InteropContext) - MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, + MemPtr = + allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, @@ -665,7 +667,8 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - assert(SrcQueue && "Source mem object and target mem object queues are expected to be not nullptr"); + assert(SrcQueue && "Source mem object and target mem object queues are " + "expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -778,9 +781,9 @@ void MemoryManager::copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, if (!SrcQueue) { if (!TgtQueue) copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, - SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, - nullptr, DimDst, DstSize, DstAccessRange, DstOffset, - 
DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); + SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, nullptr, + DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, + std::move(DepEvents), OutEvent, OutEventImpl); else copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, @@ -1235,7 +1238,8 @@ memcpyToDeviceGlobalUSM(QueueImplPtr Queue, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(Queue && "Copy to device global USM must be called with a valid device queue"); + assert(Queue && + "Copy to device global USM must be called with a valid device queue"); // Get or allocate USM memory for the device_global. DeviceGlobalUSMMem &DeviceGlobalUSM = DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue); @@ -1337,7 +1341,9 @@ static void memcpyToDeviceGlobalDirect( size_t NumBytes, size_t Offset, const void *Src, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Direct copy to device global must be called with a valid device queue"); + assert( + Queue && + "Direct copy to device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1352,7 +1358,8 @@ static void memcpyFromDeviceGlobalDirect( size_t NumBytes, size_t Offset, void *Dest, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Direct copy from device global must be called with a valid device queue"); + assert(Queue && "Direct copy from device global must be called with a valid " + "device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1762,7 +1769,8 @@ void MemoryManager::copy_image_bindless( sycl::detail::pi::PiImageRegion CopyExtent, const std::vector 
&DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Copy image bindless must be called with a valid device queue"); + assert(Queue && + "Copy image bindless must be called with a valid device queue"); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index e13bd0a3a1b31..bc6278d54f32c 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -103,9 +103,7 @@ class platform_impl { } /// \return an instance of OpenCL cl_platform_id. - cl_platform_id get() const { - return pi::cast(MPlatform); - } + cl_platform_id get() const { return pi::cast(MPlatform); } /// Returns raw underlying plug-in platform handle. /// @@ -114,9 +112,7 @@ class platform_impl { /// is in use. /// /// \return a raw plug-in platform handle. - const sycl::detail::pi::PiPlatform &getHandleRef() const { - return MPlatform; - } + const sycl::detail::pi::PiPlatform &getHandleRef() const { return MPlatform; } /// Returns all available SYCL platforms in the system. 
/// diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 584b2487f5dee..df95614d872c3 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -220,22 +220,22 @@ void program_impl::compile_with_kernel_name(std::string KernelName, std::string CompileOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::none); - create_pi_program_with_kernel_name( - KernelName, - /*JITCompilationIsRequired=*/(!CompileOptions.empty())); - compile(CompileOptions); + create_pi_program_with_kernel_name( + KernelName, + /*JITCompilationIsRequired=*/(!CompileOptions.empty())); + compile(CompileOptions); MState = program_state::compiled; } void program_impl::link(std::string LinkOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::compiled); - check_device_feature_support(MDevices); - std::vector Devices(get_pi_devices()); - const PluginPtr &Plugin = getPlugin(); - const char *LinkOpts = SYCLConfig::get(); - if (!LinkOpts) { - LinkOpts = LinkOptions.c_str(); + check_device_feature_support(MDevices); + std::vector Devices(get_pi_devices()); + const PluginPtr &Plugin = getPlugin(); + const char *LinkOpts = SYCLConfig::get(); + if (!LinkOpts) { + LinkOpts = LinkOptions.c_str(); } // Plugin resets MProgram with a new pi_program as a result of the call to @@ -251,7 +251,7 @@ void program_impl::link(std::string LinkOptions) { Plugin->checkPiResult(Err); MLinkOptions = LinkOptions; MBuildOptions = LinkOptions; - MState = program_state::linked; + MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 1fa8767774961..67c02e95734ab 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -216,9 +216,7 @@ class program_impl { } /// \return the Plugin associated with the context of this program. 
- const PluginPtr &getPlugin() const { - return MContext->getPlugin(); - } + const PluginPtr &getPlugin() const { return MContext->getPlugin(); } ContextImplPtr getContextImplPtr() const { return MContext; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index af7af19ede120..83f33688ed0b1 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -361,7 +361,8 @@ event queue_impl::submit_impl(const std::function &CGF, // Host and interop tasks, however, are not submitted to low-level runtimes // and require separate dependency management. const CG::CGTYPE Type = Handler.getType(); - event Event = detail::createSyclObjFromImpl(std::make_shared()); + event Event = detail::createSyclObjFromImpl( + std::make_shared()); std::vector Streams; if (Type == CG::Kernel) Streams = std::move(Handler.MStreamStorage); @@ -385,12 +386,12 @@ event queue_impl::submit_impl(const std::function &CGF, auto EventImpl = detail::getSyclObjImpl(Event); for (auto &Stream : Streams) { - // We don't want stream flushing to be blocking operation that is why submit a - // host task to print stream buffer. It will fire up as soon as the kernel + // We don't want stream flushing to be blocking operation that is why submit + // a host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. 
- event FlushEvent = submit_impl([&](handler &ServiceCGH) { - Stream->generateFlushCommand(ServiceCGH); - }, Self, PrimaryQueue, SecondaryQueue, Loc, {}); + event FlushEvent = submit_impl( + [&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); }, + Self, PrimaryQueue, SecondaryQueue, Loc, {}); EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent)); registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent)); } @@ -707,7 +708,7 @@ void queue_impl::revisitUnenqueuedCommandsState( Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(), [](const EventImplPtr &CommandEvent) { return (CommandEvent->isHost() ? CommandEvent->isCompleted() - : CommandEvent->isEnqueued()); + : CommandEvent->isEnqueued()); }), Deps.UnenqueuedCmdEvents.end()); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e72ded829a798..d0a74cc80c793 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -194,14 +194,13 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast(MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); xpti::addMetadata(TEvent, "queue_handle", - reinterpret_cast(getHandleRef())); + reinterpret_cast(getHandleRef())); }); // Also publish to TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -257,9 +256,8 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast(MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + 
reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); @@ -751,9 +749,8 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); - - static ContextImplPtr getContext(const QueueImplPtr& Queue) - { + + static ContextImplPtr getContext(const QueueImplPtr &Queue) { return Queue ? Queue->getContextImplPtr() : nullptr; } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index c751cf7438ae7..3d51fe7a1c12f 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -90,21 +90,19 @@ static std::string deviceToString(device Device) { return "UNKNOWN"; } -static void addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) -{ - xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(Queue->get_device()) : 0); - xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(Queue->get_device()) : "host"); - if (Queue) - xpti::addMetadata(TraceEvent, "sycl_device_name", +static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { + xpti::addMetadata(TraceEvent, "sycl_device", + Queue ? deviceToID(Queue->get_device()) : 0); + xpti::addMetadata(TraceEvent, "sycl_device_type", + Queue ? 
deviceToString(Queue->get_device()) : "host"); + if (Queue) + xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); } #endif -static ContextImplPtr getContext(const QueueImplPtr& Queue) -{ +static ContextImplPtr getContext(const QueueImplPtr &Queue) { if (Queue) return Queue->getContextImplPtr(); return nullptr; @@ -350,10 +348,12 @@ class DispatchHostTask { PluginWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &E) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return (pi_result)E.get_cl_code(); } catch (...) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return PI_ERROR_UNKNOWN; } } @@ -404,7 +404,8 @@ class DispatchHostTask { try { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { - assert(HostTask.MQueue && "Submitted queue for host task must be device queue"); + assert(HostTask.MQueue && + "Submitted queue for host task must be device queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; @@ -431,7 +432,8 @@ class DispatchHostTask { } } #endif - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } HostTask.MHostTask.reset(); @@ -448,7 +450,8 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) 
{ auto CurrentException = std::current_exception(); - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } } }; @@ -461,13 +464,13 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { - #ifndef NDEBUG - for (const EventImplPtr &Event : EventImpls) - assert(!Event->isHost() && - "Only non-host events are expected to be waited for here"); +#ifndef NDEBUG + for (const EventImplPtr &Event : EventImpls) + assert(!Event->isHost() && + "Only non-host events are expected to be waited for here"); #endif if (!EventImpls.empty()) { - if (!Queue) { + if (!Queue) { // Host queue can wait for events from different contexts, i.e. it may // contain events with different contexts in its MPreparedDepsEvents. // OpenCL 2.1 spec says that clWaitForEvents will return @@ -507,7 +510,7 @@ void Command::waitForEvents(QueueImplPtr Queue, MEvent->setHostEnqueueTime(); Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); - } + } } } @@ -716,7 +719,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // 1. Non-host events can be ignored if they are not fully initialized. // 2. Some types of commands do not produce PI events after they are - // enqueued (e.g. alloca). Note that we can't check the pi event to make that distinction since the command might still be unenqueued at this point. + // enqueued (e.g. alloca). Note that we can't check the pi event to make that + // distinction since the command might still be unenqueued at this point. 
bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -732,7 +736,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && WorkerContext){ + if (DepEventContext != WorkerContext && WorkerContext) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); } else @@ -1006,7 +1010,7 @@ void AllocaCommandBase::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); } #endif } @@ -1066,9 +1070,9 @@ pi_int32 AllocaCommand::enqueueImp() { } // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. - MMemAllocation = MemoryManager::allocate( - getContext(MQueue), getSYCLMemObj(), MInitFromUserData, HostPtr, - std::move(EventImpls), Event); + MMemAllocation = MemoryManager::allocate(getContext(MQueue), getSYCLMemObj(), + MInitFromUserData, HostPtr, + std::move(EventImpls), Event); return PI_SUCCESS; } @@ -1077,7 +1081,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA ON " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1163,8 +1168,8 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") - << "\\n"; + Stream << "ALLOCA SUB BUF ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1266,9 +1271,9 @@ pi_int32 ReleaseCommand::enqueueImp() { if (SkipRelease) Command::waitForEvents(MQueue, EventImpls, Event); else { - MemoryManager::release( - getContext(MQueue), MAllocaCmd->getSYCLMemObj(), - MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); + MemoryManager::release(getContext(MQueue), MAllocaCmd->getSYCLMemObj(), + MAllocaCmd->getMemAllocation(), + std::move(EventImpls), Event); } return PI_SUCCESS; } @@ -1277,7 +1282,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "RELEASE ON " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1347,7 +1353,8 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MAP ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1406,8 +1413,8 @@ bool UnMapMemObject::producesPiEvent() const { // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr); + backend::ext_oneapi_level_zero || + MEvent->getHandleRef() != nullptr); } pi_int32 UnMapMemObject::enqueueImp() { @@ -1428,7 +1435,8 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UNMAP ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1476,13 +1484,12 @@ void MemCpyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - MQueue ? deviceToID(MQueue->get_device()): 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1539,11 +1546,10 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; - Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue - << "\\n"; - Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue - << "\\n"; + Stream << "MEMCPY ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; + Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; Stream << "\"];" << std::endl; @@ -1597,7 +1603,8 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UPDATE REQ ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1649,14 +1656,13 @@ void MemCpyCommandHost::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - MQueue ? 
deviceToID(MQueue->get_device()) : 0); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1696,8 +1702,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } -EmptyCommand::EmptyCommand() - : Command(CommandType::EMPTY_TASK, nullptr) { +EmptyCommand::EmptyCommand() : Command(CommandType::EMPTY_TASK, nullptr) { emitInstrumentationDataProxy(); } @@ -1746,7 +1751,7 @@ void EmptyCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1775,7 +1780,8 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY HOST ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1814,7 +1820,7 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? 
MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -2082,7 +2088,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue ? Queue->getQueueID() : 0); + Queue ? Queue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2128,7 +2134,7 @@ void ExecCGCommand::emitInstrumentationData() { if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2151,7 +2157,8 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "EXEC CG ON " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2247,8 +2254,7 @@ void SetArgBasedOnType( const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, detail::ArgDesc &Arg, - size_t NextTrueIndex) { + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { case kernel_param_kind_t::kind_stream: break; @@ -2338,8 +2344,7 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue->get_context(), Arg, - NextTrueIndex); + Queue->get_context(), Arg, NextTrueIndex); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); @@ -2639,7 +2644,8 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { - assert(Queue && "Queue with submitted read write host pipe could not be on host"); + assert(Queue && + "Queue with submitted read write host pipe could not be on host"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -2856,7 +2862,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { flushCrossQueueDeps(EventImpls, MWorkerQueue); bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && - (MCommandGroup->getRequirements().size() == 0); + (MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); detail::EventImplPtr EventImpl = DiscardPiEvent ? 
nullptr : MEvent; @@ -2876,10 +2882,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { MemoryManager::copy( AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, - Req->MElemSize, Copy->getDst(), - nullptr, Req->MDims, - Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, - Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent); + Req->MElemSize, Copy->getDst(), nullptr, Req->MDims, Req->MAccessRange, + Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, + std::move(RawEvents), MEvent->getHandleRef(), MEvent); return PI_SUCCESS; } @@ -2889,8 +2894,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); MemoryManager::copy( - AllocaCmd->getSYCLMemObj(), Copy->getSrc(), - nullptr, Req->MDims, + AllocaCmd->getSYCLMemObj(), Copy->getSrc(), nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, @@ -2937,7 +2941,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector &Args = ExecKernel->MArgs; if (MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator) { + backend::ext_intel_esimd_emulator) { for (ArgDesc &Arg : Args) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -2959,7 +2963,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { reinterpret_cast(ExecKernel->MHostKernel->getPtr()), NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - return PI_SUCCESS; + return PI_SUCCESS; } auto getMemAllocationFunc = [this](Requirement *Req) { @@ -3119,7 +3123,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { - assert(MQueue && "Device queue must be present for barrier with wait list command"); 
+ assert(MQueue && + "Device queue must be present for barrier with wait list command"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3224,7 +3229,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreSignal: { - assert(MQueue && "Device queue must be present for semaphore signal command"); + assert(MQueue && + "Device queue must be present for semaphore signal command"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3348,7 +3354,7 @@ void KernelFusionCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3362,7 +3368,8 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n" + Stream << "KERNEL FUSION on " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 628ccdf2593da..63fb4853d88e4 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -373,10 +373,11 @@ class Command { std::string MSubmissionFunctionName; // This flag allows to control whether event should be set complete - // after successfull enqueue of command. Event is considered as "host" event if - // there is no backend representation of event (i.e. getHandleRef() return reference to nullptr value). - // By default the flag is set to true due to most of host operations are - // synchronous. The only asynchronous operation currently is host-task. + // after successfull enqueue of command. Event is considered as "host" event + // if there is no backend representation of event (i.e. getHandleRef() return + // reference to nullptr value). By default the flag is set to true due to most + // of host operations are synchronous. The only asynchronous operation + // currently is host-task. bool MShouldCompleteEventIfPossible = true; /// Indicates that the node will be freed by graph cleanup. 
Such nodes should @@ -792,8 +793,7 @@ void SetArgBasedOnType( const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, detail::ArgDesc &Arg, - size_t NextTrueIndex); + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex); void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2919932c4e788..2ac97baefb543 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -54,9 +54,10 @@ static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -static bool isOnSameContext(const ContextImplPtr Context, const QueueImplPtr& Queue) -{ - // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. +static bool isOnSameContext(const ContextImplPtr Context, + const QueueImplPtr &Queue) { + // Covers case for host usage (nullptr == nullptr) and existing device + // contexts comparison. return Context == queue_impl::getContext(Queue); } @@ -289,8 +290,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { auto Context = queue_impl::getContext(Queue); - AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Context); + AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -298,8 +298,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( // dependencies become invalid if requirement is stored by pointer. 
const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -353,8 +352,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); auto Context = queue_impl::getContext(Queue); - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -434,8 +432,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(!HostAllocaCmd->getQueue() && - "Host alloca command expected"); + assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; @@ -490,8 +487,7 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, if (nullptr == Record || !Record->MMemModified) return nullptr; - std::set Deps = - findDepsForReq(Record, Req, nullptr); + std::set Deps = findDepsForReq(Record, Req, nullptr); AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); @@ -531,7 +527,8 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, auto SYCLMemObj = static_cast(Req->MSYCLMemObj); SYCLMemObj->handleWriteAccessorCreation(); } - // Host accessor is not attached to any queue so no QueueImplPtr object to be sent to getOrInsertMemObjRecord. + // Host accessor is not attached to any queue so no QueueImplPtr object to be + // sent to getOrInsertMemObjRecord. 
MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req); if (MPrintOptionsArray[BeforeAddHostAcc]) printGraphAsDot("before_addHostAccessor"); @@ -556,8 +553,8 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); // Need empty command to be blocked until host accessor is destructed - EmptyCommand *EmptyCmd = - addEmptyCmd(UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); + EmptyCommand *EmptyCmd = addEmptyCmd( + UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); Req->MBlockedCmd = EmptyCmd; @@ -621,8 +618,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req); // Going through copying memory between contexts is not supported. - if (Dep.MDepCommand) - { + if (Dep.MDepCommand) { auto DepQueue = Dep.MDepCommand->getQueue(); CanBypassDep &= isOnSameContext(Context, DepQueue); } @@ -686,7 +682,8 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (std::strcmp(HUMConfig, "1") == 0) return true; } - // host task & host accessor is covered with no device context but provide required support. + // host task & host accessor is covered with no device context but provide + // required support. 
if (Ctx == nullptr) return true; @@ -705,8 +702,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { auto Context = queue_impl::getContext(Queue); - AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Context, /*AllowConst=*/false); + AllocaCommandBase *AllocaCmd = + findAllocaForReq(Record, Req, Context, /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -736,8 +733,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // TODO the case where the first alloca is made with a discard mode and // the user pointer is read-only is still not handled: it leads to // unnecessary copy on devices with unified host memory support. - const bool HostUnifiedMemory = - checkHostUnifiedMemory(Context); + const bool HostUnifiedMemory = checkHostUnifiedMemory(Context); SYCLMemObjI *MemObj = Req->MSYCLMemObj; const bool InitFromUserData = Record->MAllocaCommands.empty() && (HostUnifiedMemory || MemObj->isInterop()); @@ -828,10 +824,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->MCurContext =Context; + Record->MCurContext = Context; - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -871,8 +866,7 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, - Command::BlockReason Reason, - std::vector &ToEnqueue) { + Command::BlockReason Reason, std::vector &ToEnqueue) { EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) @@ -1343,8 +1337,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( /* DepEvents = */ {DepEvent}), 
CG::CodeplayHostTask, /* Payload */ {})); - ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), nullptr); + ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1719,13 +1712,11 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - nullptr, - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, queue_impl::getContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 52eb59b225004..4d26c2a822457 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -110,13 +110,13 @@ EventImplPtr Scheduler::addCG( Command *NewCmd = nullptr; switch (Type) { case CG::UpdateHost: - NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), - AuxiliaryCmds); + NewCmd = + MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), AuxiliaryCmds); NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - nullptr, AuxiliaryCmds); + auto Result = + MGraphBuilder.addCG(std::move(CommandGroup), nullptr, AuxiliaryCmds); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 75c80745ec71c..7e81e964bdc17 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -76,14 +76,15 @@ size_t stream_impl::get_size() const { return BufferSize_; } size_t stream_impl::get_max_statement_size() const { return 
MaxStatementSize_; } -void stream_impl::generateFlushCommand(handler& cgh) -{ +void stream_impl::generateFlushCommand(handler &cgh) { // Create accessor to the flush buffer even if not using it yet. Otherwise // kernel will be a leaf for the flush buffer and scheduler will not be able // to cleanup the kernel. TODO: get rid of finalize method by using host // accessor to the flush buffer. - host_accessor FlushBuffHostAcc(FlushBuf_, cgh); - host_accessor BufHostAcc (Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + host_accessor FlushBuffHostAcc(FlushBuf_, + cgh); + host_accessor BufHostAcc( + Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); cgh.host_task([=] { if (!BufHostAcc.empty()) { @@ -106,14 +107,14 @@ void stream_impl::generateFlushCommand(handler& cgh) }); } - // ABI break: remove - void stream_impl::initStreamHost(QueueImplPtr ){}; +// ABI break: remove +void stream_impl::initStreamHost(QueueImplPtr){}; - // ABI break: remove - void stream_impl::flush(const EventImplPtr &) {}; +// ABI break: remove +void stream_impl::flush(const EventImplPtr &) {}; - // ABI break: remove - void stream_impl::flush() {}; +// ABI break: remove +void stream_impl::flush() {}; } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index 4fc1f4b1d5a8a..670931c815185 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -68,7 +68,7 @@ class __SYCL_EXPORT stream_impl { return PropList_.get_property(); } - void generateFlushCommand(handler& cgh); + void generateFlushCommand(handler &cgh); private: // Size of the stream buffer diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 7440a3b816ce2..68207bec67d53 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -209,8 +209,9 @@ void SYCLMemObjT::detachMemoryObject( !MOwnNativeHandle || (MInteropContext && 
!MInteropContext->isOwnedByRuntime()); - if (MRecord && MRecord->MCurContext && MRecord->MCurContext->isOwnedByRuntime() && - !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) { + if (MRecord && MRecord->MCurContext && + MRecord->MCurContext->isOwnedByRuntime() && !InteropObjectsUsed && + (!MHostPtrProvided || MIsInternal)) { bool okToDefer = GlobalHandler::instance().isOkToDefer(); if (okToDefer) Scheduler::getInstance().deferMemObjRelease(Self); diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 753c27d5f678d..57c54275069e6 100755 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -73,33 +73,33 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, return nullptr; std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - - switch (Kind) { - case alloc::host: { - std::array Props; - auto PropsIter = Props.begin(); - - if (PropList.has_property() && - Ctxt.get_platform().has_extension( - "cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + + switch (Kind) { + case alloc::host: { + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + Ctxt.get_platform().has_extension( + "cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); + } - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list + assert(PropsIter >= 
Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list - Error = Plugin->call_nocheck( - &RetVal, C, Props.data(), Size, Alignment); + Error = Plugin->call_nocheck( + &RetVal, C, Props.data(), Size, Alignment); - break; + break; } case alloc::device: case alloc::shared: diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 1947e31b7daaa..e1bc8c894f311 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -83,8 +83,7 @@ class DependsOnTests : public ::testing::Test { detail::Command *NewCmd = MS.addCG( std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, - ToEnqueue); + Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, ToEnqueue); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 437f98b1579a6..c3681bfc07a3b 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -245,7 +245,8 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { checkCleanupOnLeafUpdate( MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { detail::Command *Leaf = *Record->MWriteLeaves.begin(); - MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, ToEnqueue); + MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, + ToEnqueue); }); checkCleanupOnLeafUpdate( MS, nullptr, Buf, MockReq, [&](detail::MemObjRecord *Record) { diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index bffdf6af4afe2..9ce9a1f944349 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -91,8 +91,7 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { // Check that sequential memory movements submitted to the 
same in-order // queue do not depend on each other. - detail::Command *Cmd = - MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); + detail::Command *Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); detail::EnqueueResultT Res; auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp index 71f30f91117a0..565c3b2a2314c 100644 --- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp +++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp @@ -60,8 +60,8 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) { std::vector ToEnqueue; AllocaCmd = MS.getOrCreateAllocaForReq( Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue); - std::ignore = MS.getOrCreateAllocaForReq( - Rec, &MockReq, nullptr, ToEnqueue); + std::ignore = + MS.getOrCreateAllocaForReq(Rec, &MockReq, nullptr, ToEnqueue); DepCmd = std::make_unique(detail::getSyclObjImpl(Queue), MockReq); } diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index 39146ffaa95e8..e0732926537b0 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -36,8 +36,7 @@ createGenericCommand(const std::shared_ptr &Q) { return std::shared_ptr{new MockCommand(Q, Command::RUN_CG)}; } -std::shared_ptr -createEmptyCommand(const Requirement &Req) { +std::shared_ptr createEmptyCommand(const Requirement &Req) { EmptyCommand *Cmd = new EmptyCommand(); Cmd->addRequirement(/* DepCmd = */ nullptr, /* AllocaCmd = */ nullptr, &Req); Cmd->MBlockReason = Command::BlockReason::HostAccessor; diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 6ae6b9bfc2344..b08b211d1e2dc 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ 
b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -69,8 +69,7 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { std::vector &) {}; std::shared_ptr Record{ - new sycl::detail::MemObjRecord(nullptr, 10, - AllocaDep)}; + new sycl::detail::MemObjRecord(nullptr, 10, AllocaDep)}; MemObjMock MemObj(Record); Req.MSYCLMemObj = &MemObj; diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp index 83a0702861141..24a19977844fb 100644 --- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp +++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp @@ -152,8 +152,7 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // No special handling required: alloca commands are created one after // another and the transfer is done via a write operation. - detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(nullptr, &Req); + detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(nullptr, &Req); std::vector AuxCmds; detail::AllocaCommandBase *HostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index 330ff7e0f02d2..c90db25fc019a 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -125,14 +125,12 @@ TEST_F(SchedulerTest, QueueFlushing) { detail::AllocaCommand HostAllocaCmd = detail::AllocaCommand(nullptr, MockReq); - detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, - MockReq, &HostAllocaCmd, - QueueImplA, nullptr}; + detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, MockReq, + &HostAllocaCmd, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); - detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, - MockReq, &MockHostPtr, - QueueImplA, nullptr}; + detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, + &MockHostPtr, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, 
MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index d1e7f22aa9485..789961b081da8 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include using namespace sycl; From c76484daf99edc74b77d6722fdbb4d62b707df56 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:56:31 -0700 Subject: [PATCH 31/58] fix clang-format Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/program_impl.cpp | 93 ++++++++-------- sycl/source/detail/usm/usm_impl.cpp | 160 ++++++++++++++-------------- 2 files changed, 126 insertions(+), 127 deletions(-) mode change 100755 => 100644 sycl/source/detail/usm/usm_impl.cpp diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index df95614d872c3..f3ac2185627f9 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -94,22 +94,22 @@ program_impl::program_impl( } } - std::vector Devices(get_pi_devices()); - std::vector Programs; - bool NonInterOpToLink = false; - for (const auto &Prg : ProgramList) { - if (!Prg->MLinkable && NonInterOpToLink) - continue; - NonInterOpToLink |= !Prg->MLinkable; - Programs.push_back(Prg->MProgram); - } - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), - LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, - nullptr, &MProgram); - Plugin->checkPiResult(Err); + std::vector Devices(get_pi_devices()); + std::vector Programs; + bool NonInterOpToLink = false; + for (const auto &Prg : ProgramList) { + if (!Prg->MLinkable && NonInterOpToLink) + continue; + NonInterOpToLink |= !Prg->MLinkable; + Programs.push_back(Prg->MProgram); + } + const PluginPtr &Plugin = 
getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), + LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, + nullptr, &MProgram); + Plugin->checkPiResult(Err); } program_impl::program_impl(ContextImplPtr Context, @@ -236,22 +236,22 @@ void program_impl::link(std::string LinkOptions) { const char *LinkOpts = SYCLConfig::get(); if (!LinkOpts) { LinkOpts = LinkOptions.c_str(); - } + } - // Plugin resets MProgram with a new pi_program as a result of the call to - // "piProgramLink". Thus, we need to release MProgram before the call to - // piProgramLink. - if (MProgram != nullptr) - Plugin->call(MProgram); - - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, - /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); - Plugin->checkPiResult(Err); - MLinkOptions = LinkOptions; - MBuildOptions = LinkOptions; - MState = program_state::linked; + // Plugin resets MProgram with a new pi_program as a result of the call to + // "piProgramLink". Thus, we need to release MProgram before the call to + // piProgramLink. 
+ if (MProgram != nullptr) + Plugin->call(MProgram); + + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, + /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); + Plugin->checkPiResult(Err); + MLinkOptions = LinkOptions; + MBuildOptions = LinkOptions; + MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, @@ -363,24 +363,23 @@ std::pair program_impl::get_pi_kernel_arg_mask_pair(const std::string &KernelName) const { std::pair Result; - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MProgram, KernelName.c_str(), &Result.first); - if (Err == PI_ERROR_INVALID_KERNEL_NAME) { - throw invalid_object_error( - "This instance of program does not contain the kernel requested", - Err); - } - Plugin->checkPiResult(Err); + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MProgram, KernelName.c_str(), &Result.first); + if (Err == PI_ERROR_INVALID_KERNEL_NAME) { + throw invalid_object_error( + "This instance of program does not contain the kernel requested", Err); + } + Plugin->checkPiResult(Err); - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. - if (getContextImplPtr()->getPlatformImpl()->supports_usm()) - Plugin->call( - Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. 
+ if (getContextImplPtr()->getPlatformImpl()->supports_usm()) + Plugin->call( + Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - return Result; + return Result; } std::vector diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp old mode 100755 new mode 100644 index 57c54275069e6..7237e88be440f --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -100,20 +100,20 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, &RetVal, C, Props.data(), Size, Alignment); break; - } - case alloc::device: - case alloc::shared: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; - } - } + } + case alloc::device: + case alloc::shared: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; + } + } - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. - if (Error != PI_SUCCESS) - return nullptr; + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. 
+ if (Error != PI_SUCCESS) + return nullptr; #ifdef XPTI_ENABLE_INSTRUMENTATION xpti::addMetadata(PrepareNotify.traceEvent(), "memory_ptr", reinterpret_cast(RetVal)); @@ -139,79 +139,79 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, if (Size == 0) return nullptr; - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - pi_device Id; + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + pi_device Id; - switch (Kind) { - case alloc::device: { - Id = DevImpl->getHandleRef(); + switch (Kind) { + case alloc::device: { + Id = DevImpl->getHandleRef(); - std::array Props; - auto PropsIter = Props.begin(); + std::array Props; + auto PropsIter = Props.begin(); - // Buffer location is only supported on FPGA devices - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } + // Buffer location is only supported on FPGA devices + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); + } - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); + Error = Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); - break; - } - case alloc::shared: { - Id = DevImpl->getHandleRef(); - - std::array Props; - auto PropsIter = Props.begin(); - - if 
(PropList.has_property< - sycl::ext::oneapi::property::usm::device_read_only>()) { - *PropsIter++ = PI_MEM_ALLOC_FLAGS; - *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; - } - - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } - - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list - - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); - - break; - } - case alloc::host: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; + break; + } + case alloc::shared: { + Id = DevImpl->getHandleRef(); + + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::oneapi::property::usm::device_read_only>()) { + *PropsIter++ = PI_MEM_ALLOC_FLAGS; + *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; } + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); } - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. - if (Error != PI_SUCCESS) - return nullptr; + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list + + Error = Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); + + break; + } + case alloc::host: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; + } + } + + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. 
+ if (Error != PI_SUCCESS) + return nullptr; return RetVal; } @@ -250,9 +250,9 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, void freeInternal(void *Ptr, const context_impl *CtxImpl) { if (Ptr == nullptr) return; - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - Plugin->call(C, Ptr); + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + Plugin->call(C, Ptr); } void free(void *Ptr, const context &Ctxt, From 61d1c6208e4ef52c3b72908b9f904ba9869ffdb5 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 08:52:31 -0700 Subject: [PATCH 32/58] fix connect task queue Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/graph_builder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2ac97baefb543..7cfc0446fdd69 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1330,7 +1330,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( try { std::unique_ptr HT(new detail::HostTask); std::unique_ptr ConnectCG(new detail::CGHostTask( - std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {}, + std::move(HT), /* Queue = */ Cmd->getQueue(), /* Context = */ {}, + /* Args = */ {}, detail::CG::StorageInitHelper( /* ArgsStorage = */ {}, /* AccStorage = */ {}, /* SharedPtrStorage = */ {}, /* Requirements = */ {}, From 5814e466577f0b99d6d6095d3e0d68a25452203c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 06:30:09 -0700 Subject: [PATCH 33/58] fix bugs Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 11 +++++++++-- sycl/source/detail/queue_impl.cpp | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp 
index 7d91129f25b51..a270867f6b637 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -262,7 +262,8 @@ void event_impl::wait_and_throw( void event_impl::checkProfilingPreconditions() const { std::weak_ptr EmptyPtr; - if (!EmptyPtr.owner_before(MQueue) && !MQueue.owner_before(EmptyPtr)) { + if (!MIsHostEvent && !EmptyPtr.owner_before(MQueue) && + !MQueue.owner_before(EmptyPtr)) { throw sycl::exception(make_error_code(sycl::errc::invalid), "Profiling information is unavailable as the event " "has no associated queue."); @@ -300,7 +301,7 @@ event_impl::get_profiling_info() { // made by forcing the re-sync of submit time to start time is less than // 0.5ms. These timing values were obtained empirically using an integrated // Intel GPU). - if (MEventFromSubmittedExecCommandBuffer && MEvent) { + if (MEventFromSubmittedExecCommandBuffer && !MIsHostEvent && MEvent) { uint64_t StartTime = get_event_profiling_info( this->getHandleRef(), this->getPlugin()); @@ -546,6 +547,12 @@ void event_impl::setSubmissionTime() { e.what()); std::rethrow_exception(std::current_exception()); } + } else { + // Returning host time + using namespace std::chrono; + MSubmitTime = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); } } else { // Capture the host timestamp for a return value of function call diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 83f33688ed0b1..572b0b8cf568a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -299,12 +299,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. 
- if (EImpl->isHost() || MEmulateOOO) + if (MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (EImpl->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); From a03468173acf6f9c58593685069d030955a4782c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 09:43:06 -0700 Subject: [PATCH 34/58] fix work with graph Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 4 ++-- sycl/source/detail/queue_impl.cpp | 21 ++++++++++++++++----- sycl/source/detail/queue_impl.hpp | 16 +++++++++++++--- sycl/source/detail/scheduler/commands.cpp | 20 ++++++++++---------- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index a270867f6b637..e203924d2d612 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -48,7 +48,7 @@ event_impl::~event_impl() { } void event_impl::waitInternal(bool *Success) { - if (MEvent) { + if (!MIsHostEvent && MEvent) { // Wait for the native event sycl::detail::pi::PiResult Err = getPlugin()->call_nocheck(1, &MEvent); @@ -390,7 +390,7 @@ event_impl::get_info() { return sycl::info::event_command_status::submitted; } - return MState.load() != HES_Complete + return MIsHostEvent && MState.load() != HES_Complete ? 
sycl::info::event_command_status::submitted : info::event_command_status::complete; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 572b0b8cf568a..a5f9ae9964ac6 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -696,6 +696,19 @@ void queue_impl::revisitUnenqueuedCommandsState( const EventImplPtr &CompletedHostTask) { if (MIsInorder) return; + + std::unique_lock Lock{MMutex, std::try_to_lock}; + if (Lock.owns_lock()) + doUnenqueuedCommandCleanup(CompletedHostTask->getCommandGraph()); + else { + std::lock_guard RequestLock(MMissedCleanupRequestsMtx); + MMissedCleanupRequests.push_back(CompletedHostTask->getCommandGraph()); + } +} + +void queue_impl::doUnenqueuedCommandCleanup( + const std::shared_ptr + &Graph) { auto tryToCleanup = [](DependencyTrackingItems &Deps) { if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) { Deps.LastBarrier = nullptr; @@ -713,14 +726,12 @@ void queue_impl::revisitUnenqueuedCommandsState( Deps.UnenqueuedCmdEvents.end()); } }; - std::lock_guard Lock{MMutex}; // Barrier enqueue could be significantly postponed due to host task // dependency if any. No guarantee that it will happen while same graph deps // are still recording. - if (auto Graph = CompletedHostTask->getCommandGraph()) { - if (Graph == getCommandGraph()) - tryToCleanup(MExtGraphDeps); - } else + if (Graph && Graph == getCommandGraph()) + tryToCleanup(MExtGraphDeps); + else tryToCleanup(MDefaultGraphDeps); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d0a74cc80c793..aa3dd9fc780bf 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -93,7 +93,7 @@ class queue_impl { /// \param PropList is a list of properties to use for queue construction. 
queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList) - : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){}; + : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList) {}; /// Constructs a SYCL queue with an async_handler and property_list provided /// form a device and a context. @@ -749,6 +749,9 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); + void doUnenqueuedCommandCleanup( + const std::shared_ptr + &Graph); static ContextImplPtr getContext(const QueueImplPtr &Queue) { return Queue ? Queue->getContextImplPtr() : nullptr; @@ -790,13 +793,12 @@ class queue_impl { EventToBuildDeps = getSyclObjImpl(EventRet); } else { const CG::CGTYPE Type = Handler.getType(); - + std::lock_guard Lock{MMutex}; // The following code supports barrier synchronization if host task is // involved in the scenario. Native barriers cannot handle host task // dependency so in the case where some commands were not enqueued // (blocked), we track them to prevent barrier from being enqueued // earlier. - std::lock_guard Lock{MMutex}; auto &Deps = MGraph.expired() ? 
MDefaultGraphDeps : MExtGraphDeps; if (Type == CG::Barrier && !Deps.UnenqueuedCmdEvents.empty()) { Handler.depends_on(Deps.UnenqueuedCmdEvents); @@ -814,6 +816,10 @@ class queue_impl { } else Deps.UnenqueuedCmdEvents.push_back(EventRetImpl); } + std::lock_guard RequestLock(MMissedCleanupRequestsMtx); + for (auto &UpdatedGraph : MMissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MMissedCleanupRequests.clear(); } } @@ -966,6 +972,10 @@ class queue_impl { unsigned long long MQueueID; static std::atomic MNextAvailableQueueID; + std::deque> + MMissedCleanupRequests; + std::mutex MMissedCleanupRequestsMtx; + friend class sycl::ext::oneapi::experimental::detail::node_impl; }; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3d51fe7a1c12f..6322b904fd6bc 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2954,16 +2954,16 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Plugin->call(RawEvents.size(), &RawEvents[0]); } - assert(MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator); - if (MEvent != nullptr) - MEvent->setHostEnqueueTime(); - MQueue->getPlugin()->call( - nullptr, - reinterpret_cast(ExecKernel->MHostKernel->getPtr()), - NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], - &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - return PI_SUCCESS; + assert(MQueue->getDeviceImplPtr()->getBackend() == + backend::ext_intel_esimd_emulator); + if (MEvent != nullptr) + MEvent->setHostEnqueueTime(); + MQueue->getPlugin()->call( + nullptr, + reinterpret_cast(ExecKernel->MHostKernel->getPtr()), + NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], + &NDRDesc.LocalSize[0], 0, nullptr, nullptr); + return PI_SUCCESS; } auto getMemAllocationFunc = [this](Requirement *Req) { From c274c5ec74a0e92306824194a7f5ef9509c83df2 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 10:14:54 -0700 
Subject: [PATCH 35/58] fix tracing tests Signed-off-by: Tikhomirova, Kseniya --- .../Tracing/code_location_queue_submit.cpp | 13 +++---------- sycl/test-e2e/Tracing/task_execution.cpp | 18 ++++++------------ .../Tracing/task_execution_handler.cpp | 4 ++-- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp index 6ebfe43e936e5..ce780f5e81725 100644 --- a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp +++ b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp @@ -5,8 +5,7 @@ // Test tracing of the code location data for queue.submit in case of failure // (exception generation) -// First queue creation (id = 0) is queue created on line 15. -// The second queue is a host queue created on first scheduler usage. +// First queue creation (id = 0) is queue created on line 17. #include #include @@ -19,16 +18,10 @@ int main() { unsigned char *HostAllocDst = NULL; // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : {{.*}} - // CHECK-DAG: sycl_context : {{.*}} - // CHECK-NEXT: [SYCL] Queue create: // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device + // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK: [SYCL] Runtime reports: // CHECK-NEXT: what: NULL pointer argument in memory copy operation. 
-30 (PI_ERROR_INVALID_VALUE) @@ -44,6 +37,6 @@ int main() { sycl::free(HostAllocSrc, Q); } // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 return !ExceptionCaught; } diff --git a/sycl/test-e2e/Tracing/task_execution.cpp b/sycl/test-e2e/Tracing/task_execution.cpp index d591c20b8f6c0..b4932df0eda55 100644 --- a/sycl/test-e2e/Tracing/task_execution.cpp +++ b/sycl/test-e2e/Tracing/task_execution.cpp @@ -15,38 +15,32 @@ int main() { Q.copy(AllocDst, AllocSrc, 1).wait(); // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: value_set : 0 // CHECK-DAG: memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: dest_memory_ptr : {{.*}} // CHECK-DAG: src_memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) - // CHECK-NEXT: [SYCL] Queue create: - // CHECK-DAG: queue_id : 1 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device - // CHECK-DAG: sycl_context : {{.*}} Q.single_task([]() {}).wait(); // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) // CHECK-DAG: enqueue_kernel_data : {{.*}} // CHECK-DAG: sym_column_no : {{.*}} - // CHECK-DAG: sym_line_no : 43 + // CHECK-DAG: sym_line_no : 37 // CHECK-DAG: sym_source_file_name : {{.*}}task_execution.cpp - // CHECK-DAG: queue_id : 0 + // 
CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_function_name : typeinfo name for main::E2ETestKernel // CHECK-DAG: from_source : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} @@ -55,7 +49,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 sycl::free(AllocSrc, Q); sycl::free(AllocDst, Q); } diff --git a/sycl/test-e2e/Tracing/task_execution_handler.cpp b/sycl/test-e2e/Tracing/task_execution_handler.cpp index 0563275f81312..a208fe6655bda 100644 --- a/sycl/test-e2e/Tracing/task_execution_handler.cpp +++ b/sycl/test-e2e/Tracing/task_execution_handler.cpp @@ -16,7 +16,7 @@ int main() { { cgh.memset(AllocSrc, 0, 1); }) .wait(); // CHECK: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} @@ -27,7 +27,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} From f50526bf29351cbc0d897ae6a59c699aca910522 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 21 Jun 2024 04:23:03 -0700 Subject: [PATCH 36/58] fix test Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/scheduler.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 4d26c2a822457..905ca889aaf0d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -207,7 +207,7 @@ EventImplPtr 
Scheduler::addCopyBack(Requirement *Req) { { WriteLockT Lock = acquireWriteLock(); NewCmd = MGraphBuilder.addCopyBack(Req, AuxiliaryCmds); - // Command was not creted because there were no operations with + // Command was not created because there were no operations with // buffer. if (!NewCmd) return nullptr; @@ -232,7 +232,9 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); } catch (...) { - NewCmd->getQueue()->reportAsyncException(std::current_exception()); + auto WorkerQueue = NewCmd->getEvent()->getWorkerQueue(); + assert(WorkerQueue && "WorkerQueue for CopyBack command must be not null"); + WorkerQueue->reportAsyncException(std::current_exception()); } EventImplPtr NewEvent = NewCmd->getEvent(); cleanupCommands(ToCleanUp); From 2bd06e3a3ab0170ce0dfef9ace4ae16573ce7c69 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 04:17:25 -0700 Subject: [PATCH 37/58] update win symbols Signed-off-by: Tikhomirova, Kseniya --- sycl/test/abi/sycl_symbols_windows.dump | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e8610211e8572..c091a7751a0cc 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -41,18 +41,12 @@ ??$get_info@U?$max_work_groups@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$00@23@XZ ??$get_info@U?$max_work_groups@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$01@23@XZ ??$get_info@U?$max_work_groups@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$02@23@XZ 
+??$get_info@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
??$get_info@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
-??$get_info_impl@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uarchitecture@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AW4architecture@experimental@oneapi@ext@23@XZ ??$get_info@Uatomic_fence_order_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_order@_V1@sycl@@V?$allocator@W4memory_order@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uatomic_fence_scope_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_scope@_V1@sycl@@V?$allocator@W4memory_scope@_V1@sycl@@@std@@@std@@XZ @@ -108,6 +102,12 @@ ??$get_info_impl@U?$max_work_item_sizes@$00@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$00@12@XZ ??$get_info_impl@U?$max_work_item_sizes@$01@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$01@12@XZ ??$get_info_impl@U?$max_work_item_sizes@$02@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$02@12@XZ +??$get_info_impl@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
+??$get_info_impl@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info_impl@Uaddress_bits@device@info@_V1@sycl@@@device@_V1@sycl@@AEBAIXZ ??$get_info_impl@Uarchitecture@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AW4architecture@experimental@oneapi@ext@12@XZ ??$get_info_impl@Uaspects@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4aspect@_V1@sycl@@V?$allocator@W4aspect@_V1@sycl@@@std@@@std@@XZ @@ -4080,7 +4080,6 @@ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z 
?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z -?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z ?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4arch_category@experimental@oneapi@ext@23@@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z @@ -4096,7 +4095,6 @@ ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@56723@@Z ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@56723@@Z ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@V?$range@$02@23@AEBUimage_descriptor@56723@PEAX111@Z -?ext_oneapi_prod@queue@_V1@sycl@@QEAAXXZ ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KAEBUcode_location@detail@23@@Z ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KV423@AEBUcode_location@detail@23@@Z @@ -4158,6 +4156,7 @@ ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vstream@_V1@sycl@@@2oneapi@ext@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVstream@34@@Z 
?ext_oneapi_prefetch_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_prod@queue@_V1@sycl@@QEAAXXZ ?ext_oneapi_set_external_event@queue@_V1@sycl@@QEAAXAEBVevent@23@@Z ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@@Z ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@_K@Z @@ -4205,6 +4204,7 @@ ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z +?generateFlushCommand@stream_impl@detail@_V1@sycl@@QEAAXAEAVhandler@34@@Z ?get@context@_V1@sycl@@QEBAPEAU_cl_context@@XZ ?get@device@_V1@sycl@@QEBAPEAU_cl_device_id@@XZ ?get@kernel@_V1@sycl@@QEBAPEAU_cl_kernel@@XZ @@ -4655,6 +4655,7 @@ ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z ?usesPinnedHostMemory@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ +?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z ?verifyKernelInvoc@handler@_V1@sycl@@AEAAXAEBVkernel@23@@Z ?verifyUsedKernelBundle@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z ?verifyUsedKernelBundleInternal@handler@_V1@sycl@@AEAAXVstring_view@detail@23@@Z From 5fbcb1ead2551a055366f906a093c9267ccaf978 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 05:17:33 -0700 Subject: [PATCH 38/58] fix format Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7e81e964bdc17..1ba09ed36369c 100644 --- 
a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -108,13 +108,13 @@ void stream_impl::generateFlushCommand(handler &cgh) { } // ABI break: remove -void stream_impl::initStreamHost(QueueImplPtr){}; +void stream_impl::initStreamHost(QueueImplPtr){} // ABI break: remove -void stream_impl::flush(const EventImplPtr &) {}; +void stream_impl::flush(const EventImplPtr &) {} // ABI break: remove -void stream_impl::flush() {}; +void stream_impl::flush() {} } // namespace detail } // namespace _V1 From d5d15bf8f4b4317e3a9f43ce179a65f7a195f849 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 08:28:19 -0700 Subject: [PATCH 39/58] fix formatting Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 1ba09ed36369c..b9f70581ac7a8 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -108,7 +108,7 @@ void stream_impl::generateFlushCommand(handler &cgh) { } // ABI break: remove -void stream_impl::initStreamHost(QueueImplPtr){} +void stream_impl::initStreamHost(QueueImplPtr) {} // ABI break: remove void stream_impl::flush(const EventImplPtr &) {} From e185cbcca90a9d76827c95fe211aace1c7284f95 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 25 Jun 2024 08:25:30 -0700 Subject: [PATCH 40/58] self review comments fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 2 +- sycl/source/detail/buffer_impl.cpp | 4 +- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/event_impl.hpp | 2 +- sycl/source/detail/memory_manager.cpp | 2 +- sycl/source/detail/platform_impl.hpp | 6 -- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/scheduler/commands.cpp | 84 ++++++++----------- sycl/source/detail/scheduler/scheduler.hpp | 10 +-- sycl/source/device.cpp | 2 +- sycl/source/event.cpp | 2 +- 
sycl/source/kernel.cpp | 2 +- sycl/source/platform.cpp | 2 +- sycl/source/queue.cpp | 2 +- .../test-e2e/SubGroup/sub_groups_sycl2020.cpp | 4 - 15 files changed, 52 insertions(+), 76 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 1261096b82047..e4c7404c7b078 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -127,7 +127,7 @@ context::get_backend_info() const { cl_context context::get() const { return impl->get(); } bool context::is_host() const { - assert(true && "context::is_host should not be called in implementation."); + assert(false && "context::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index f13444107e9eb..1795992594078 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -24,7 +24,9 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, sycl::detail::pi::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - + assert(!(nullptr == HostPtr && BaseT::useHostPtr() && !Context) && + "Internal error. 
Allocating memory on the host " + "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( std::move(Context), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e203924d2d612..f4ad52221ed37 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -367,7 +367,7 @@ uint64_t event_impl::get_profiling_info() { } template <> uint32_t event_impl::get_info() { - if (MEvent) { + if (!MIsHostEvent && MEvent) { return get_event_info(this->getHandleRef(), this->getPlugin()); } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 8b46e715cd13e..12b58d25ab3cd 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,7 +49,7 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsFlushed(true), + : MIsInitialized(false), MIsHostEvent(State), MIsFlushed(true), MState(State.value_or(HES_Complete)) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 6f30ceef8eb51..97615960877ff 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -935,7 +935,7 @@ void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - // Host queue is not supported here. + // Execution on host is not supported here. 
if (!Queue) { throw runtime_error("Not supported configuration of unmap requested", PI_ERROR_INVALID_OPERATION); diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index bc6278d54f32c..0a926712eb806 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -32,9 +32,6 @@ class device_impl; // TODO: implement parameters treatment for host device class platform_impl { public: - /// Constructs platform_impl for a SYCL host platform. - platform_impl() : MHostPlatform(true) {} - /// Constructs platform_impl from a plug-in interoperability platform /// handle. /// @@ -125,7 +122,6 @@ class platform_impl { // \return the Plugin associated with this platform. const PluginPtr &getPlugin() const { - assert(!MHostPlatform && "Plugin is not available for Host."); return MPlugin; } @@ -134,7 +130,6 @@ class platform_impl { /// \param PluginPtr is a pointer to a plugin instance /// \param Backend is the backend that we want this platform to use void setPlugin(PluginPtr &PluginPtr, backend Backend) { - assert(!MHostPlatform && "Plugin is not available for Host"); MPlugin = PluginPtr; MBackend = Backend; } @@ -214,7 +209,6 @@ class platform_impl { filterDeviceFilter(std::vector &PiDevices, ListT *FilterList) const; - bool MHostPlatform = false; sycl::detail::pi::PiPlatform MPlatform = 0; backend MBackend; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index a5f9ae9964ac6..ae59239664327 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -655,7 +655,7 @@ bool queue_impl::ext_oneapi_empty() const { info::event_command_status::complete; } - // Check the status of the backend queue if this is not a host queue. + // Check the status of the backend queue. 
pi_bool IsReady = false; getPlugin()->call( MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6322b904fd6bc..d52fb0da025f3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -79,7 +79,10 @@ static size_t deviceToID(const device &Device) { return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } -static std::string deviceToString(device Device) { +static std::string queueDeviceToString(const QueueImplPtr &Queue) { + if (!Queue) + return "host"; + auto Device = Queue->get_device(); if (Device.is_cpu()) return "CPU"; else if (Device.is_gpu()) @@ -91,15 +94,19 @@ static std::string deviceToString(device Device) { } static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(Queue->get_device()) : 0); - xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(Queue->get_device()) : "host"); + xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); if (Queue) + { + xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); + } } +static unsigned long long getQueueID(const QueueImplPtr& Queue) +{ + return Queue ? Queue->getQueueID() : 0; +} #endif static ContextImplPtr getContext(const QueueImplPtr &Queue) { @@ -1009,8 +1016,7 @@ void AllocaCommandBase::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); } #endif } @@ -1081,8 +1087,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1130,8 +1135,7 @@ void AllocaSubBufCommand::emitInstrumentationData() { this->MRequirement.MAccessRange[0]); xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1168,8 +1172,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue)<< "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1207,8 +1210,7 @@ void ReleaseCommand::emitInstrumentationData() { commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1282,8 +1284,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "RELEASE ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1327,8 +1328,7 @@ void MapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1353,8 +1353,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1389,8 +1388,7 @@ void UnMapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1435,8 +1433,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UNMAP ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; @@ -1488,8 +1485,7 @@ void MemCpyCommand::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1546,8 +1542,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; @@ -1603,8 +1598,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " - << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue) << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1661,8 +1655,7 @@ void MemCpyCommandHost::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1750,8 +1743,7 @@ void EmptyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1780,8 +1772,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; @@ -1819,8 +1810,7 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -2087,9 +2077,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue ? Queue->getQueueID() : 0); - + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue)); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, SyclKernel, Queue, CGArgs); @@ -2133,8 +2121,7 @@ void ExecCGCommand::emitInstrumentationData() { CmdTraceEvent); if (CmdTraceEvent) { - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2157,8 +2144,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "EXEC CG ON " << queueDeviceToString(MQueue) << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -3353,8 +3339,7 @@ void KernelFusionCommand::emitInstrumentationData() { if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3368,8 +3353,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n" + Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue) << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 03372fc0b7a8f..cd5ae6bd0e0fe 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -213,16 +213,16 @@ struct MemObjRecord { // Contains latest write commands working with memory object. LeavesCollection MWriteLeaves; - // The flag indicates that the content of the memory object was/will be - // modified. Used while deciding if copy back needed. - bool MMemModified = false; - // The context which has the latest state of the memory object. ContextImplPtr MCurContext; - // The mode this object can be accessed with from the host (host_accessor). + // The mode this object can be accessed from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; + + // The flag indicates that the content of the memory object was/will be + // modified. Used while deciding if copy back needed. + bool MMemModified = false; }; /// DPC++ graph scheduler class. 
diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index a3a88ebf6636a..18b9cf4036cda 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -71,7 +71,7 @@ std::vector device::get_devices(info::device_type deviceType) { cl_device_id device::get() const { return impl->get(); } bool device::is_host() const { - assert(true && "device::is_host should not be called in implementation."); + assert(false && "device::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index 12b4a7e68164e..69d62f354ea4c 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -38,7 +38,7 @@ bool event::operator==(const event &rhs) const { return rhs.impl == impl; } bool event::operator!=(const event &rhs) const { return !(*this == rhs); } bool event::is_host() const { - assert(true && "event::is_host should not be called in implementation."); + assert(false && "event::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index bc842f6e596a5..625eb995c47d3 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -31,7 +31,7 @@ kernel::kernel(cl_kernel ClKernel, const context &SyclContext) cl_kernel kernel::get() const { return impl->get(); } bool kernel::is_host() const { - assert(true && "kernel::is_host should not be called in implementation."); + assert(false && "kernel::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/platform.cpp b/sycl/source/platform.cpp index 9a15943213ec6..179c8c09d0825 100644 --- a/sycl/source/platform.cpp +++ b/sycl/source/platform.cpp @@ -41,7 +41,7 @@ bool platform::has_extension(const std::string &ExtensionName) const { } bool platform::is_host() const { - assert(true && "platform::is_host should not be called in implementation."); + assert(false && "platform::is_host should not be called in implementation."); return false; } diff 
--git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 174d1f9197af1..5cd0bd3449095 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -96,7 +96,7 @@ queue::ext_oneapi_get_graph() const { } bool queue::is_host() const { - assert(true && "queue::is_host should not be called in implementation."); + assert(false && "queue::is_host should not be called in implementation."); return false; } diff --git a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp index 5b71a60a54051..a7d4c6493b8b5 100644 --- a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp +++ b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp @@ -1,9 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Assertion `!MHostPlatform && "Plugin is not available for Host."' failed on -// Nvidia. -// XFAIL: hip_nvidia #include From a87b32817a46d1dfdba9205163106f2af565ea6c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 04:35:59 -0700 Subject: [PATCH 41/58] fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.hpp | 4 ++-- sycl/source/detail/scheduler/commands.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 12b58d25ab3cd..f609bd96b7189 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,8 +49,8 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsHostEvent(State), MIsFlushed(true), - MState(State.value_or(HES_Complete)) { + : MIsInitialized(false), MIsFlushed(true), + MState(State.value_or(HES_Complete)), MIsHostEvent(State) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. 
This ::get() call uses static vars to read and parse the diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d52fb0da025f3..9d9315652ed55 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1353,7 +1353,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << queueDeviceToString(MQueue) : "host") << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; From 0a5a7583eef8f597c8b82c70a8671aeb1f45097c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 07:18:55 -0700 Subject: [PATCH 42/58] Update isCOntextInitialized stuff Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 27 +++++----- sycl/source/detail/event_impl.hpp | 39 ++++++++------- sycl/source/detail/helpers.cpp | 4 +- sycl/source/detail/scheduler/commands.cpp | 49 ++++++++++++------- sycl/source/detail/scheduler/scheduler.cpp | 4 +- sycl/source/queue.cpp | 2 +- sycl/unittests/buffer/BufferReleaseBase.hpp | 4 -- sycl/unittests/pi/PiMock.cpp | 4 -- .../scheduler/EnqueueWithDependsOnDeps.cpp | 4 -- .../scheduler/InOrderQueueHostTaskDeps.cpp | 4 -- sycl/unittests/scheduler/KernelFusion.cpp | 4 -- 11 files changed, 66 insertions(+), 79 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f4ad52221ed37..58a52230f1269 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,8 +33,8 @@ extern xpti::trace_event_data_t *GSYCLGraphEvent; #endif // If we do not yet have a context, use the default one. 
-void event_impl::ensureContextInitialized() { - if (MIsContextInitialized) +void event_impl::tryToInitContext() { + if (MContext || !MIsDefaultConstructed) return; const device SyclDevice; @@ -114,12 +114,12 @@ const sycl::detail::pi::PiEvent &event_impl::getHandleRef() const { sycl::detail::pi::PiEvent &event_impl::getHandleRef() { return MEvent; } const ContextImplPtr &event_impl::getContextImpl() { - ensureContextInitialized(); + tryToInitContext(); return MContext; } const PluginPtr &event_impl::getPlugin() { - ensureContextInitialized(); + tryToInitContext(); return MContext->getPlugin(); } @@ -128,14 +128,12 @@ void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { MIsHostEvent = Context == nullptr; MContext = Context; - MIsContextInitialized = true; } event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) - : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), - MState(HES_Complete) { + : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)), + MIsFlushed(true), MState(HES_Complete) { sycl::detail::pi::PiContext TempContext; getPlugin()->call( @@ -398,7 +396,7 @@ event_impl::get_info() { template <> typename info::platform::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -419,7 +417,7 @@ event_impl::get_backend_info() const { template <> typename info::device::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -437,7 +435,7 @@ event_impl::get_backend_info() const { template <> typename info::device::backend_version::return_type 
event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::ext_oneapi_level_zero) { @@ -456,11 +454,12 @@ void HostProfilingInfo::start() { StartTime = getTimestamp(); } void HostProfilingInfo::end() { EndTime = getTimestamp(); } pi_native_handle event_impl::getNative() { - ensureContextInitialized(); + if (isHost()) + return {}; + tryToInitContext(); auto Plugin = getPlugin(); - if (!MIsInitialized) { - MIsInitialized = true; + if (MIsDefaultConstructed && !MEvent) { auto TempContext = MContext.get()->getHandleRef(); Plugin->call(TempContext, &MEvent); } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f609bd96b7189..f4c2ac2e90a86 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,8 +49,8 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsFlushed(true), - MState(State.value_or(HES_Complete)), MIsHostEvent(State) { + : MIsFlushed(true), MState(State.value_or(HES_Complete)), + MIsDefaultConstructed(!State), MIsHostEvent(State) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. This ::get() call uses static vars to read and parse the @@ -255,15 +255,6 @@ class event_impl { QueueImplPtr getSubmittedQueue() const { return MSubmittedQueue.lock(); }; - /// Checks if an event is in a fully intialized state. Default-constructed - /// events will return true only after having initialized its native event, - /// while other events will assume that they are fully initialized at - /// construction, relying on external sources to supply member data. 
- /// - /// \return true if the event is considered to be in a fully initialized - /// state. - bool isInitialized() const noexcept { return MIsInitialized; } - /// Checks if this event is complete. /// /// \return true if this event is complete. @@ -279,10 +270,11 @@ class event_impl { MPostCompleteEvents.push_back(Event); } - bool isContextInitialized() const noexcept { return MIsContextInitialized; } + bool isDefaultConstructed() const noexcept { return MIsDefaultConstructed; } ContextImplPtr getContextImplPtr() { - ensureContextInitialized(); + if (MIsDefaultConstructed) + tryToInitContext(); return MContext; } @@ -347,11 +339,7 @@ class event_impl { void instrumentationEpilog(void *TelementryEvent, const std::string &Name, int32_t StreamID, uint64_t IId) const; void checkProfilingPreconditions() const; - // Events constructed without a context will lazily use the default context - // when needed. - void ensureContextInitialized(); - bool MIsInitialized = true; - bool MIsContextInitialized = false; + sycl::detail::pi::PiEvent MEvent = nullptr; // Stores submission time of command associated with event uint64_t MSubmitTime = 0; @@ -409,7 +397,20 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; - bool MIsHostEvent{false}; + + // Events constructed without a context will lazily use the default context + // when needed. + void tryToInitContext(); + // Event class represents 3 different kinds of operations: + // | type | has PI event | MContext | MIsHostTask | MIsDefaultConstructed | + // | dev | true | !nullptr | false | false | + // | host | false | nullptr | true | false | + // |default| * | * | false | true | + // Default constructed event is created with empty ctor in host code, MContext + // is lazily initialized with default device context on first context query. + // MEvent is lazily created in first pi handle query. 
+ bool MIsDefaultConstructed = false; + bool MIsHostEvent = false; }; } // namespace detail diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 75c6fd72b8fd0..901fd34b4cce8 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -31,9 +31,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->isHost()) || - SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { continue; } // The fusion command and its event are associated with a non-host context, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9d9315652ed55..1b9aea1c10f02 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -95,16 +95,15 @@ static std::string queueDeviceToString(const QueueImplPtr &Queue) { static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); - if (Queue) - { - xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); + if (Queue) { + xpti::addMetadata(TraceEvent, "sycl_device", + deviceToID(Queue->get_device())); xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); } } -static unsigned long long getQueueID(const QueueImplPtr& Queue) -{ +static unsigned long long getQueueID(const QueueImplPtr &Queue) { return Queue ? 
Queue->getQueueID() : 0; } #endif @@ -279,7 +278,7 @@ std::vector Command::getPiEventsBlocking( // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. // Skip host task and NOP events also. - if (!EventImpl->isContextInitialized() || EventImpl->isHost() || + if (EventImpl->isDefaultConstructed() || EventImpl->isHost() || EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been @@ -728,7 +727,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // 2. Some types of commands do not produce PI events after they are // enqueued (e.g. alloca). Note that we can't check the pi event to make that // distinction since the command might still be unenqueued at this point. - bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); + bool PiEventExpected = + (!DepEvent->isHost() && !DepEvent->isDefaultConstructed()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -1016,7 +1016,8 @@ void AllocaCommandBase::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); } #endif } @@ -1135,7 +1136,8 @@ void AllocaSubBufCommand::emitInstrumentationData() { this->MRequirement.MAccessRange[0]); xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1172,7 +1174,7 @@ void AllocaSubBufCommand::printDot(std::ostream 
&Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue)<< "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1210,7 +1212,8 @@ void ReleaseCommand::emitInstrumentationData() { commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1328,7 +1331,8 @@ void MapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1388,7 +1392,8 @@ void UnMapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1485,7 +1490,8 @@ void MemCpyCommand::emitInstrumentationData() { MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1655,7 +1661,8 @@ void MemCpyCommandHost::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1743,7 +1750,8 @@ void EmptyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1810,7 +1818,8 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -2121,7 +2130,8 @@ void ExecCGCommand::emitInstrumentationData() { CmdTraceEvent); if (CmdTraceEvent) { - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + 
getQueueID(MQueue)); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -3339,7 +3349,8 @@ void KernelFusionCommand::emitInstrumentationData() { if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 905ca889aaf0d..4acc5b6c3a6a4 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -697,9 +697,7 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. 
- if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->isHost()) || - SyclEventImplPtr->isNOP()) { + if ((SyclEventImplPtr->isDefaultConstructed()) || SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->isHost()) { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 5cd0bd3449095..9c807f90061b5 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -244,7 +244,7 @@ event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, bool AllEventsEmptyOrNop = std::all_of( begin(WaitList), end(WaitList), [&](const event &Event) -> bool { auto EventImpl = detail::getSyclObjImpl(Event); - return !EventImpl->isContextInitialized() || EventImpl->isNOP(); + return EventImpl->isDefaultConstructed() || EventImpl->isNOP(); }); if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled && AllEventsEmptyOrNop) diff --git a/sycl/unittests/buffer/BufferReleaseBase.hpp b/sycl/unittests/buffer/BufferReleaseBase.hpp index b35d73cb3909c..bfcc4fb8369ed 100644 --- a/sycl/unittests/buffer/BufferReleaseBase.hpp +++ b/sycl/unittests/buffer/BufferReleaseBase.hpp @@ -43,10 +43,6 @@ class BufferDestructionCheckCommon : public ::testing::Test { protected: void SetUp() override { - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } MockSchedulerPtr = new MockScheduler(); sycl::detail::GlobalHandler::instance().attachScheduler( dynamic_cast(MockSchedulerPtr)); diff --git a/sycl/unittests/pi/PiMock.cpp b/sycl/unittests/pi/PiMock.cpp index c7014162f9cf8..02044d9631376 100644 --- a/sycl/unittests/pi/PiMock.cpp +++ b/sycl/unittests/pi/PiMock.cpp @@ -56,10 +56,6 @@ TEST(PiMockTest, ConstructFromQueue) { sycl::unittest::PiMock Mock; queue MockQ{Mock.getPlatform().get_devices()[0]}; queue NormalQ; - if (NormalQ.is_host()) { - std::cerr << "Not run due to host-only environment\n"; - return; - } const auto &NormalPiPlugin = detail::getSyclObjImpl(NormalQ)->getPlugin()->getPiPlugin(); 
diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index e1bc8c894f311..08f03420ac54e 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -26,10 +26,6 @@ constexpr auto DisableCleanupName = "SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP"; std::vector> PassedNumEvents; bool CheckTestExecutionRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 8693ff5e4c52b..929f8735bc85f 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -130,10 +130,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; diff --git a/sycl/unittests/scheduler/KernelFusion.cpp b/sycl/unittests/scheduler/KernelFusion.cpp index 8b45c03e37f1f..5a86636b13c09 100644 --- a/sycl/unittests/scheduler/KernelFusion.cpp +++ b/sycl/unittests/scheduler/KernelFusion.cpp @@ -42,10 +42,6 @@ detail::Command *CreateTaskCommand(MockScheduler &MS, } bool CheckTestExecRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. 
if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { From 97c4ce548c894ab94b223fd66d1d18f7a97f7d78 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:00:51 -0700 Subject: [PATCH 43/58] prepare removal from handler Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 69 +++++++++------------------- sycl/source/detail/platform_impl.hpp | 4 +- 2 files changed, 23 insertions(+), 50 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a71f5400a813d..19d0c5ac1e85e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -178,22 +178,22 @@ template -static Arg member_ptr_helper(RetType (Func::*)(Arg) const); +static Arg member_ptr_helper(RetType (Func:: *)(Arg) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg member_ptr_helper(RetType (Func::*)(Arg)); +static Arg member_ptr_helper(RetType (Func:: *)(Arg)); // Version with two arguments to handle the case when kernel_handler is passed // to a lambda template -static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const); +static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2)); +static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2)); template decltype(member_ptr_helper(&F::operator())) argument_helper(int); @@ -464,8 +464,8 @@ class __SYCL_EXPORT handler { /// Constructs SYCL handler from queue. /// /// \param Queue is a SYCL queue. - /// \param IsHost indicates if this handler is created for SYCL host device. 
- handler(std::shared_ptr Queue, bool IsHost); + handler(std::shared_ptr Queue, + bool /*ABI Break: to remove */); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. @@ -475,10 +475,10 @@ class __SYCL_EXPORT handler { /// \param PrimaryQueue is the primary SYCL queue of the submission. /// \param SecondaryQueue is the secondary SYCL queue of the submission. This /// is null if no secondary queue is associated with the submission. - /// \param IsHost indicates if this handler is created for SYCL host device. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool IsHost); + std::shared_ptr SecondaryQueue, + bool /*ABI Break: to remove */); /// Constructs SYCL handler from Graph. /// @@ -609,7 +609,7 @@ class __SYCL_EXPORT handler { ~handler() = default; // TODO: Private and unusued. Remove when ABI break is allowed. - bool is_host() { return MIsHost; } + bool is_host() { return false; } #ifdef __SYCL_DEVICE_ONLY__ // In device compilation accessor isn't inherited from host base classes, so @@ -888,12 +888,6 @@ class __SYCL_EXPORT handler { detail::KernelLambdaHasKernelHandlerArgT::value; - if (IsCallableWithKernelHandler && MIsHost) { - throw sycl::feature_not_supported( - "kernel_handler is not yet supported by host device.", - PI_ERROR_INVALID_OPERATION); - } - KernelType *KernelPtr = ResetHostKernel(KernelFunc); @@ -1042,8 +1036,7 @@ class __SYCL_EXPORT handler { std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool> copyAccToAccHelper(accessor Src, accessor Dst) { - if (!MIsHost && - IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) + if (IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) return false; range<1> LinearizedRange(Src.size()); @@ -1065,6 +1058,7 @@ class __SYCL_EXPORT handler { /// /// \param Src is a source SYCL accessor. /// \param Dst is a destination SYCL accessor. 
+ // ABI break: to remove whole method template copyAccToAccHelper(accessor Src, accessor Dst) { - if (!MIsHost) - return false; - - single_task<__copyAcc2Acc>( - [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); }); - return true; + return false; } #ifndef __SYCL_DEVICE_ONLY__ + // ABI break: to remove whole method /// Copies the content of memory object accessed by Src into the memory /// pointed by Dst. /// @@ -1101,6 +1090,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element accessed by 0-dimensional accessor Src into the memory /// pointed by Dst. /// @@ -1118,6 +1108,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies the memory pointed by Src into the memory accessed by Dst. /// /// \param Src is a pointer to source memory. @@ -1135,6 +1126,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element pointed by Src to memory accessed by 0-dimensional /// accessor Dst. 
/// @@ -2245,7 +2237,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(range<1>{1}); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2282,7 +2274,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2322,7 +2314,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2361,7 +2353,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(true); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2688,14 +2680,6 @@ class __SYCL_EXPORT handler { "Invalid accessor target for the copy method."); static_assert(isValidModeForSourceAccessor(AccessMode), "Invalid accessor mode for the copy method."); -#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyAccToPtrHost(Src, Dst); - return; - } -#endif setType(detail::CG::CopyAccToPtr); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src; @@ -2732,14 +2716,7 @@ class __SYCL_EXPORT handler { "Invalid accessor mode for the copy method."); // TODO: Add static_assert with is_device_copyable when vec is // device-copyable. 
-#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyPtrToAccHost(Src, Dst); - return; - } -#endif + setType(detail::CG::CopyPtrToAcc); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst; @@ -2853,8 +2830,6 @@ class __SYCL_EXPORT handler { fill(accessor Dst, const T &Pattern) { - assert(!MIsHost && "fill() should no longer be callable on a host device."); - if (Dst.is_placeholder()) checkIfPlaceholderIsBoundToHandler(Dst); @@ -3392,7 +3367,7 @@ class __SYCL_EXPORT handler { /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; - bool MIsHost = false; + bool MIsHost = false; // ABI break: to remove detail::code_location MCodeLoc = {}; bool MIsFinalized = false; diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 0a926712eb806..dfb2597bf417b 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -121,9 +121,7 @@ class platform_impl { static std::vector get_platforms(); // \return the Plugin associated with this platform. - const PluginPtr &getPlugin() const { - return MPlugin; - } + const PluginPtr &getPlugin() const { return MPlugin; } /// Sets the platform implementation to use another plugin. 
/// From 6cf3171d7d43021fd668789e5b83d12331d41858 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:05:12 -0700 Subject: [PATCH 44/58] fix test Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/Config/allowlist.cpp | 58 +++++++++++++----------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 121e911c0474c..7bfb16ca687d0 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -35,61 +35,51 @@ int main() { // Expected that the allowlist filter is not set if (getenv("PRINT_PLATFORM_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { + std::string Name = Platform.get_info(); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. 
- replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; - - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + const sycl::device Dev = Platform.get_devices().at(0); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (getenv("TEST_DEVICE_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - if (Platform.get_devices().size() != 1) - throw std::runtime_error("Expected only one non host device."); + if (Platform.get_devices().size() != 1) + throw std::runtime_error("Expected only one device."); - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; + } } // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) - throw std::runtime_error("Expected no non host device is available"); + throw std::runtime_error("Expected no device is available"); return 0; } From 989557abba027be8a90c106ac69bac046016565d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:22:56 -0700 Subject: [PATCH 45/58] fix clang-format Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 8 +++--- sycl/test-e2e/Config/allowlist.cpp | 40 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 19d0c5ac1e85e..6df476e2d2d96 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -178,22 +178,22 @@ template -static Arg member_ptr_helper(RetType (Func:: *)(Arg) const); +static Arg member_ptr_helper(RetType (Func::*)(Arg) const); // Non-const version of the above template to match 
functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg member_ptr_helper(RetType (Func:: *)(Arg)); +static Arg member_ptr_helper(RetType (Func::*)(Arg)); // Version with two arguments to handle the case when kernel_handler is passed // to a lambda template -static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2) const); +static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2)); +static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2)); template decltype(member_ptr_helper(&F::operator())) argument_helper(int); diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 7bfb16ca687d0..7891088db5abb 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -36,34 +36,34 @@ int main() { if (getenv("PRINT_PLATFORM_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - return 0; + return 0; } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; + return 0; } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result @@ -72,14 +72,14 @@ int main() { if (Platform.get_devices().size() != 1) throw std::runtime_error("Expected only one device."); - return 0; - } + return 0; + } } // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - throw std::runtime_error("Expected no device is available"); + throw std::runtime_error("Expected no device is available"); return 0; } From 1a139752d02529ac27903be31b1e772e994aeb34 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 27 Jun 2024 03:41:00 -0700 Subject: [PATCH 46/58] fix warning 
Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 6df476e2d2d96..a536d41f329e0 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1064,8 +1064,8 @@ class __SYCL_EXPORT handler { access::mode ModeDst, access::target TargetDst, access::placeholder IsPHSrc, access::placeholder IsPHDst> std::enable_if_t - copyAccToAccHelper(accessor Src, - accessor Dst) { + copyAccToAccHelper(accessor, + accessor) { return false; } From e9fffb6419638e729ca7a9da32bd054b50a1dc37 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 27 Jun 2024 03:48:10 -0700 Subject: [PATCH 47/58] fix allowlist test cherry-pick issues Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/Config/allowlist.cpp | 49 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 7891088db5abb..393326cb76283 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -34,46 +34,51 @@ int main() { // Expected that the allowlist filter is not set if (getenv("PRINT_PLATFORM_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - return 0; + return 0; + } + throw std::runtime_error("No device is found"); } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (getenv("TEST_DEVICE_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { if (Platform.get_devices().size() != 1) throw std::runtime_error("Expected only one device."); - return 0; - } + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set but empty From 6ec2b63ecaedf8476d8a7dab3ce1bcc7b6e5963d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:06:17 -0700 Subject: [PATCH 48/58] fix code review comments Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 14 +---- sycl/source/detail/scheduler/commands.cpp | 60 +++++++------------ .../source/detail/scheduler/graph_builder.cpp | 4 +- sycl/source/detail/scheduler/scheduler.cpp | 2 +- sycl/source/detail/xpti_registry.cpp | 15 +++++ sycl/source/detail/xpti_registry.hpp | 3 + sycl/test-e2e/Config/allowlist.cpp | 2 +- 7 files changed, 47 insertions(+), 53 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0ec8f57abb596..6f6e72fbd2af9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,7 +26,7 @@ namespace sycl { inline namespace _V1 { namespace detail { -// Treat 0 as reserved for "host" queue +// Treat 0 as reserved for host task traces std::atomic queue_impl::MNextAvailableQueueID = 1; thread_local bool NestedCallsDetector = false; @@ -498,17 +498,7 @@ void *queue_impl::instrumentationProlog(const 
detail::code_location &CodeLoc, xpti_at::active, &QWaitInstanceNo); IId = QWaitInstanceNo; if (WaitEvent) { - device D = get_device(); - std::string DevStr; - if (D.is_cpu()) - DevStr = "CPU"; - else if (D.is_gpu()) - DevStr = "GPU"; - else if (D.is_accelerator()) - DevStr = "ACCELERATOR"; - else - DevStr = "UNKNOWN"; - xpti::addMetadata(WaitEvent, "sycl_device_type", DevStr); + xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); if (HasSourceInfo) { xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2ab4663c5db20..9ea45424f0ce5 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -79,22 +79,8 @@ static size_t deviceToID(const device &Device) { return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } -static std::string queueDeviceToString(const QueueImplPtr &Queue) { - if (!Queue) - return "host"; - auto Device = Queue->get_device(); - if (Device.is_cpu()) - return "CPU"; - else if (Device.is_gpu()) - return "GPU"; - else if (Device.is_accelerator()) - return "ACCELERATOR"; - else - return "UNKNOWN"; -} - static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); + xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue.get())); if (Queue) { xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); @@ -411,7 +397,7 @@ class DispatchHostTask { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { assert(HostTask.MQueue && - "Submitted queue for host task must be device queue"); + "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, 
HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; @@ -1088,7 +1074,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "ALLOCA ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1174,7 +1160,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1287,7 +1273,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "RELEASE ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1357,7 +1343,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1438,7 +1424,7 @@ void 
UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "UNMAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1548,7 +1534,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MEMCPY ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; @@ -1604,7 +1590,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue.get()) << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? 
"Buffer" : "Image") << "\\n"; @@ -1780,7 +1766,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1971,7 +1957,7 @@ void instrumentationAddExtraKernelMetadata( if (!SyclKernel->isCreatedFromSource()) EliminatedArgMask = SyclKernel->getKernelArgMask(); } else { - assert(Queue && "Queue with submitted kernel could not be on host"); + assert(Queue && "Kernel submissions should have an associated queue"); std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName); @@ -2154,7 +2140,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "EXEC CG ON " << queueDeviceToString(MQueue.get()) << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2345,7 +2331,7 @@ static pi_result SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative) { - assert(Queue && "Queue with submitted kernel could not be on host"); + assert(Queue && "Kernel submissions should have an associated queue"); const PluginPtr &Plugin = Queue->getPlugin(); auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, @@ -2536,7 +2522,7 @@ pi_int32 enqueueImpKernel( const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative) { - assert(Queue && "Queue with submitted kernel could not be on host"); + 
assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); auto DeviceImpl = Queue->getDeviceImplPtr(); @@ -2652,7 +2638,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { assert(Queue && - "Queue with submitted read write host pipe could not be on host"); + "ReadWrite host pipe submissions should have an associated queue"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -2702,7 +2688,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, } pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { - assert(MQueue && "Device queue is required for command buffer enqueue"); + assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies waitForPreparedHostEvents(); @@ -2941,7 +2927,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Kernel: { - assert(MQueue && "Device queue must be present for kernel command"); + assert(MQueue && "Kernel submissions should have an associated queue"); CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; @@ -3094,7 +3080,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { - assert(MQueue && "Device queue must be present for barrier command"); + assert(MQueue && "Barrier submission should have an associated queue"); const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); @@ -3105,7 +3091,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::BarrierWaitlist: { assert(MQueue && - "Device queue must be present for barrier with wait list command"); + "Barrier submission should have an associated queue"); CGBarrier *Barrier = 
static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3173,7 +3159,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { - assert(MQueue && "Device queue must be present for command buffer enqueue"); + assert(MQueue && "Command buffer submissions should have an associated queue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3197,7 +3183,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { - assert(MQueue && "Device queue must be present for semaphore wait command"); + assert(MQueue && "Semaphore wait submissions should have an associated queue"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3211,7 +3197,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreSignal: { assert(MQueue && - "Device queue must be present for semaphore signal command"); + "Semaphore signal submissions should have an associated queue"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3349,7 +3335,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue) << "\\n" + Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue.get()) << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 7cfc0446fdd69..284985b2f9c16 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -678,7 +678,7 @@ 
AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (const char *HUMConfig = SYCLConfig::get()) { if (std::strcmp(HUMConfig, "0") == 0) - return false; + return Ctx == nullptr; if (std::strcmp(HUMConfig, "1") == 0) return true; } @@ -768,7 +768,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if ((Context != nullptr) != (Record->MCurContext != nullptr)) { + if ((Context == nullptr) != (Record->MCurContext == nullptr)) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 4acc5b6c3a6a4..a14af63b1a2a0 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -697,7 +697,7 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. 
- if ((SyclEventImplPtr->isDefaultConstructed()) || SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->isHost()) { diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index c08e620b0583d..ed629b39b9be0 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -8,6 +8,7 @@ #include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION #include "xpti/xpti_trace_framework.hpp" @@ -362,6 +363,20 @@ void XPTIRegistry::sampledImageHostAccessorNotification( #endif } +std::string queueDeviceToString(const queue_impl* const &Queue) { + if (!Queue) + return "HOST"; + auto Device = Queue->get_device(); + if (Device.is_cpu()) + return "CPU"; + else if (Device.is_gpu()) + return "GPU"; + else if (Device.is_accelerator()) + return "ACCELERATOR"; + else + return "UNKNOWN"; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 681e2841c027b..a66ac46a0cd34 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -319,6 +319,9 @@ class XPTIScope { }; // class XPTIScope #endif +class queue_impl; +std::string queueDeviceToString(const detail::queue_impl* const &Queue); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 393326cb76283..063ebabc1aba5 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -83,7 +83,7 @@ int main() { // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!sycl::platform::get_platforms().empty()) throw std::runtime_error("Expected no device is available"); return 0; } From 
954ba8b77e99d017fdaac40417b75da7419a0d11 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:22:06 -0700 Subject: [PATCH 49/58] extra code review changes Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 8 ++++---- sycl/source/detail/event_impl.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 58a52230f1269..85afb56fcaf9b 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,7 +33,7 @@ extern xpti::trace_event_data_t *GSYCLGraphEvent; #endif // If we do not yet have a context, use the default one. -void event_impl::tryToInitContext() { +void event_impl::initContextIfNeeded() { if (MContext || !MIsDefaultConstructed) return; @@ -114,12 +114,12 @@ const sycl::detail::pi::PiEvent &event_impl::getHandleRef() const { sycl::detail::pi::PiEvent &event_impl::getHandleRef() { return MEvent; } const ContextImplPtr &event_impl::getContextImpl() { - tryToInitContext(); + initContextIfNeeded(); return MContext; } const PluginPtr &event_impl::getPlugin() { - tryToInitContext(); + initContextIfNeeded(); return MContext->getPlugin(); } @@ -456,7 +456,7 @@ void HostProfilingInfo::end() { EndTime = getTimestamp(); } pi_native_handle event_impl::getNative() { if (isHost()) return {}; - tryToInitContext(); + initContextIfNeeded(); auto Plugin = getPlugin(); if (MIsDefaultConstructed && !MEvent) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f4c2ac2e90a86..e52ac40ad78d7 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -274,7 +274,7 @@ class event_impl { ContextImplPtr getContextImplPtr() { if (MIsDefaultConstructed) - tryToInitContext(); + initContextIfNeeded(); return MContext; } @@ -400,7 +400,7 @@ class event_impl { // Events constructed without a context will lazily use the default context // when needed. 
- void tryToInitContext(); + void initContextIfNeeded(); // Event class represents 3 different kinds of operations: // | type | has PI event | MContext | MIsHostTask | MIsDefaultConstructed | // | dev | true | !nullptr | false | false | From 3fb26e0fdc88ee470b6a360f0fda3f3a35137b9c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:35:49 -0700 Subject: [PATCH 50/58] fix format Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 8 ++++---- sycl/source/detail/queue_impl.cpp | 3 +-- sycl/source/detail/scheduler/commands.cpp | 12 +++++++----- sycl/source/detail/scheduler/graph_builder.cpp | 5 ++--- sycl/source/detail/scheduler/scheduler.cpp | 5 ++--- sycl/source/detail/xpti_registry.cpp | 4 ++-- sycl/source/detail/xpti_registry.hpp | 2 +- sycl/source/handler.cpp | 15 +++++++-------- sycl/test-e2e/Config/allowlist.cpp | 2 +- .../scheduler/EnqueueWithDependsOnDeps.cpp | 8 ++++---- 10 files changed, 31 insertions(+), 33 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index ec59dc8aece7c..61b23ffd707d5 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -488,8 +488,8 @@ class __SYCL_EXPORT handler { /// \param IsHost indicates if this handler is created for SYCL host device. /// \param CallerNeedsEvent indicates if the event resulting from this handler /// is needed by the caller. - handler(std::shared_ptr Queue, bool /* ABI break: remove */, - bool CallerNeedsEvent); + handler(std::shared_ptr Queue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. @@ -504,8 +504,8 @@ class __SYCL_EXPORT handler { /// is needed by the caller. 
handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool /* ABI break: remove */, - bool CallerNeedsEvent); + std::shared_ptr SecondaryQueue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from Graph. /// diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 45ca3aa0b2291..588254743701f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -354,8 +354,7 @@ event queue_impl::submit_impl(const std::function &CGF, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess) { - handler Handler(Self, PrimaryQueue, SecondaryQueue, false, - CallerNeedsEvent); + handler Handler(Self, PrimaryQueue, SecondaryQueue, false, CallerNeedsEvent); Handler.saveCodeLoc(Loc); { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0b7f38d6e429d..38aa77e0c92ed 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -80,7 +80,8 @@ static size_t deviceToID(const device &Device) { } static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue.get())); + xpti::addMetadata(TraceEvent, "sycl_device_type", + queueDeviceToString(Queue.get())); if (Queue) { xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); @@ -3099,8 +3100,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { - assert(MQueue && - "Barrier submission should have an associated queue"); + assert(MQueue && "Barrier submission should have an associated queue"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3168,7 +3168,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, 
EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { - assert(MQueue && "Command buffer submissions should have an associated queue"); + assert(MQueue && + "Command buffer submissions should have an associated queue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3192,7 +3193,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { - assert(MQueue && "Semaphore wait submissions should have an associated queue"); + assert(MQueue && + "Semaphore wait submissions should have an associated queue"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index dcd4a0aa96dce..f8397016fce41 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1339,9 +1339,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( /* DepEvents = */ {DepEvent}), CG::CodeplayHostTask, /* Payload */ {})); - ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), nullptr, - /*EventNeeded=*/true); + ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr, + /*EventNeeded=*/true); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index cea700a311b7d..fbea6f14dea3d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -115,9 +115,8 @@ EventImplPtr Scheduler::addCG( NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = - MGraphBuilder.addCG(std::move(CommandGroup), nullptr, - AuxiliaryCmds, EventNeeded); + auto Result = MGraphBuilder.addCG(std::move(CommandGroup), nullptr, + AuxiliaryCmds, EventNeeded); NewCmd = 
Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index ed629b39b9be0..1884f5cd34265 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION #include "xpti/xpti_trace_framework.hpp" @@ -363,7 +363,7 @@ void XPTIRegistry::sampledImageHostAccessorNotification( #endif } -std::string queueDeviceToString(const queue_impl* const &Queue) { +std::string queueDeviceToString(const queue_impl *const &Queue) { if (!Queue) return "HOST"; auto Device = Queue->get_device(); diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index a66ac46a0cd34..356679a75c2fb 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -320,7 +320,7 @@ class XPTIScope { #endif class queue_impl; -std::string queueDeviceToString(const detail::queue_impl* const &Queue); +std::string queueDeviceToString(const detail::queue_impl *const &Queue); } // namespace detail } // namespace _V1 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 011d3c4efce22..72277bb39ed31 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -87,8 +87,7 @@ handler::handler(std::shared_ptr Queue, bool) /// TODO: Unused. Remove with ABI break. 
handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool) + std::shared_ptr SecondaryQueue, bool) : handler(Queue, PrimaryQueue, SecondaryQueue, false, /*CallerNeedsEvent=*/true) {} @@ -98,8 +97,8 @@ handler::handler(std::shared_ptr Queue, bool, handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool, bool CallerNeedsEvent) + std::shared_ptr SecondaryQueue, bool, + bool CallerNeedsEvent) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue), CallerNeedsEvent)), @@ -287,10 +286,10 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif - Result = enqueueImpKernel( - MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel, - MKernelName.c_str(), RawEvents, NewEvent, nullptr, - MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative); + Result = enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, + MKernel, MKernelName.c_str(), RawEvents, + NewEvent, nullptr, MImpl->MKernelCacheConfig, + MImpl->MKernelIsCooperative); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 063ebabc1aba5..56dfbc081fb06 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -83,7 +83,7 @@ int main() { // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { - if (!sycl::platform::get_platforms().empty()) + if (!sycl::platform::get_platforms().empty()) throw std::runtime_error("Expected no device is available"); return 0; } diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 5ad8a17af15d9..31d4e92bf89a8 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ 
b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -78,10 +78,10 @@ class DependsOnTests : public ::testing::Test { std::unique_ptr CmdGroup = MockCGH.finalize(); - detail::Command *NewCmd = MS.addCG( - std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, - ToEnqueue, /*EventNeeded=*/true); + detail::Command *NewCmd = + MS.addCG(std::move(CmdGroup), + Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, + ToEnqueue, /*EventNeeded=*/true); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } From 67a546270431a328f5920883732bce9820c394df Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:42:16 -0700 Subject: [PATCH 51/58] fix format 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4e9936fe042fb..123efc3d87af6 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -350,9 +350,7 @@ class queue_impl { bool hasDiscardEventsProperty() const { return MDiscardEvents; } /// \return true if this queue allows for discarded events. 
- bool supportsDiscardingPiEvents() const { - return MIsInorder; - } + bool supportsDiscardingPiEvents() const { return MIsInorder; } bool isInOrder() const { return MIsInorder; } From 76a073c7d04b31c7952d1ce3f6e9dda37f36e800 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 10:09:15 -0700 Subject: [PATCH 52/58] update win symbols Signed-off-by: Tikhomirova, Kseniya --- sycl/test/abi/sycl_symbols_windows.dump | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 54c7a77403c92..d02be89140c5a 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -569,10 +569,10 @@ ??0half@host_half_impl@detail@_V1@sycl@@QEAA@AEBM@Z ??0half@host_half_impl@detail@_V1@sycl@@QEAA@G@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z -??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N1@Z -??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N1@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z ??0host_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0host_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0host_selector@_V1@sycl@@QEAA@XZ @@ -4084,7 +4084,6 @@ ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z -?generateFlushCommand@stream_impl@detail@_V1@sycl@@QEAAXAEAVhandler@34@@Z ?get@context@_V1@sycl@@QEBAPEAU_cl_context@@XZ ?get@device@_V1@sycl@@QEBAPEAU_cl_device_id@@XZ ?get@kernel@_V1@sycl@@QEBAPEAU_cl_kernel@@XZ From 741795d41e86599198e924f677e635cd38f67d5e Mon 
Sep 17 00:00:00 2001 From: Lorenc Bushi <113361374+lbushi25@users.noreply.github.com> Date: Mon, 1 Jul 2024 08:40:49 -0400 Subject: [PATCH 53/58] [SYCL] Fix assertion failure in E2E marray test (#14234) This PR fixes a GPU accuracy bug by upscaling the error-tolerance to a double type if the GPU supports 64-bit floating point arithmetic. --- sycl/test-e2e/Basic/built-ins/helpers.hpp | 28 ++++++++++++++++------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/Basic/built-ins/helpers.hpp b/sycl/test-e2e/Basic/built-ins/helpers.hpp index 03a7c720e9afd..724e417c4d6e0 100644 --- a/sycl/test-e2e/Basic/built-ins/helpers.hpp +++ b/sycl/test-e2e/Basic/built-ins/helpers.hpp @@ -33,16 +33,28 @@ void test(bool CheckDevice, double delta, FuncTy F, ExpectedTy Expected, sycl::buffer SuccessBuf{1}; + sycl::queue q; + sycl::device dev = q.get_device(); // Make sure we don't use fp64 on devices that don't support it. - sycl::detail::get_elem_type_t d(delta); - - sycl::queue{}.submit([&](sycl::handler &cgh) { + const bool fp64 = dev.has(sycl::aspect::fp64); + q.submit([&](sycl::handler &cgh) { sycl::accessor Success{SuccessBuf, cgh}; - cgh.single_task([=]() { - auto R = F(Args...); - static_assert(std::is_same_v); - Success[0] = equal(R, Expected, d); - }); + if (fp64) { + cgh.single_task([=]() { + auto R = F(Args...); + static_assert(std::is_same_v); + // use double precision error tolerance when fp64 supported + Success[0] = equal(R, Expected, delta); + }); + } else { + // downscale the error tolerance when fp64 is not supported + sycl::detail::get_elem_type_t d(delta); + cgh.single_task([=]() { + auto R = F(Args...); + static_assert(std::is_same_v); + Success[0] = equal(R, Expected, d); + }); + } }); assert(sycl::host_accessor{SuccessBuf}[0]); } From ec9059089635dba20989427739e3ea2694f604c9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:37:31 +0000 Subject: [PATCH 54/58] Bump 
the github-actions group with 2 updates (#14365) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps the github-actions group with 2 updates: [github/codeql-action](https://github.com/github/codeql-action) and [softprops/action-gh-release](https://github.com/softprops/action-gh-release). Updates `github/codeql-action` from 3.25.7 to 3.25.11
Changelog

Sourced from github/codeql-action's changelog.

CodeQL Action Changelog

See the releases page for the relevant changes to the CodeQL CLI and language packs.

Note that the only difference between v2 and v3 of the CodeQL Action is the node version they support, with v3 running on node 20 while we continue to release v2 to support running on node 16. For example 3.22.11 was the first v3 release and is functionally identical to 2.22.11. This approach ensures an easy way to track exactly which features are included in different versions, indicated by the minor and patch version numbers.

[UNRELEASED]

No user facing changes.

3.25.11 - 28 Jun 2024

  • Avoid failing the workflow run if there is an error while uploading debug artifacts. #2349
  • Update default CodeQL bundle version to 2.17.6. #2352

3.25.10 - 13 Jun 2024

  • Update default CodeQL bundle version to 2.17.5. #2327

3.25.9 - 12 Jun 2024

  • Avoid failing database creation if the database folder already exists and contains some unexpected files. Requires CodeQL 2.18.0 or higher. #2330
  • The init Action will attempt to clean up the database cluster directory before creating a new database and at the end of the job. This will help to avoid issues where the database cluster directory is left in an inconsistent state. #2332

3.25.8 - 04 Jun 2024

  • Update default CodeQL bundle version to 2.17.4. #2321

3.25.7 - 31 May 2024

  • We are rolling out a feature in May/June 2024 that will reduce the Actions cache usage of the Action by keeping only the newest TRAP cache for each language. #2306

3.25.6 - 20 May 2024

  • Update default CodeQL bundle version to 2.17.3. #2295

3.25.5 - 13 May 2024

  • Add a compatibility matrix of supported CodeQL Action, CodeQL CLI, and GitHub Enterprise Server versions to the https://github.com/github/codeql-action/blob/main/README.md. #2273
  • Avoid printing out a warning for a missing on.push trigger when the CodeQL Action is triggered via a workflow_call event. #2274
  • The tools: latest input to the init Action has been renamed to tools: linked. This option specifies that the Action should use the tools shipped at the same time as the Action. The old name will continue to work for backwards compatibility, but we recommend that new workflows use the new name. #2281

3.25.4 - 08 May 2024

  • Update default CodeQL bundle version to 2.17.2. #2270

3.25.3 - 25 Apr 2024

  • Update default CodeQL bundle version to 2.17.1. #2247
  • Workflows running on macos-latest using CodeQL CLI versions before v2.15.1 will need to either upgrade their CLI version to v2.15.1 or newer, or change the platform to an Intel MacOS runner, such as macos-12. ARM machines with SIP disabled, including the newest macos-latest image, are unsupported for CLI versions before 2.15.1. #2261

... (truncated)

Commits
  • b611370 Merge pull request #2357 from github/update-v3.25.11-de945755c
  • 3e6431f Update changelog for v3.25.11
  • de94575 Merge pull request #2352 from github/update-bundle/codeql-bundle-v2.17.6
  • a32d305 Add changelog note
  • 9ccc995 Update default bundle to codeql-bundle-v2.17.6
  • 9b7c22c Merge pull request #2351 from github/dependabot/npm_and_yarn/npm-6791eaa26c
  • 9cf3243 Rebuild
  • 1895b29 Update checked-in dependencies
  • 9dcfde9 Bump the npm group with 2 updates
  • 8723b5b Merge pull request #2350 from github/angelapwen/add-exclude-pr-check-param
  • Additional commits viewable in compare view

Updates `softprops/action-gh-release` from 2.0.5 to 2.0.6
Release notes

Sourced from softprops/action-gh-release's releases.

v2.0.6

maintenance release with updated dependencies

Changelog

Sourced from softprops/action-gh-release's changelog.

2.0.6

  • maintenance release with updated dependencies

2.0.5

2.0.4

2.0.3

  • Declare make_latest as an input field in action.yml #419

2.0.2

  • Revisit approach to #384 making unresolved pattern failures opt-in #417

2.0.1

2.0.0

  • 2.0.0!? this release corrects a disjunction between git tag versions used in the marketplace and versions listed in this file. Previous versions should have really been 1.*. Going forward this should be better aligned.
  • Upgrade action.yml declaration to node20 to address deprecations

0.1.15

  • Upgrade to action.yml declaration to node16 to address deprecations
  • Upgrade dependencies
  • Add asset output as a JSON array containing information about the uploaded assets

0.1.14

  • provides a new workflow input option generate_release_notes which when set to true will automatically generate release notes for you based on GitHub activity #179. Please see the GitHub docs for this feature for more information

0.1.13

  • fix issue with multiple runs concatenating release bodies #145

0.1.12

  • fix bug leading to empty strings substituted for inputs users don't provide breaking api calls #144

... (truncated)

Commits
  • a74c6b7 update changelog
  • b909f76 update dist/index.js
  • e49d08f chore(deps): bump glob from 8.0.3 to 10.4.2
  • f12ad25 chore(deps): bump @octokit/plugin-throttling from 4.3.2 to 9.3.0
  • 7039a82 chore: release 2.0.6
  • f9c2b6c chore: update deps and run build
  • 73738a6 chore(deps): bump node dep and @types/node
  • a500a35 Bump ts-jest from 29.0.3 to 29.1.4 (#459)
  • See full diff in compare view

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore <dependency name> major version` will close this group update PR and stop Dependabot creating any more for the specific dependency's major version (unless you unignore this specific dependency's major version or upgrade to it yourself) - `@dependabot ignore <dependency name> minor version` will close this group update PR and stop Dependabot creating any more for the specific dependency's minor version (unless you unignore this specific dependency's minor version or upgrade to it yourself) - `@dependabot ignore <dependency name>` will close this group update PR and stop Dependabot creating any more for the specific dependency (unless you unignore this specific dependency or upgrade to it yourself) - `@dependabot unignore <dependency name>` will remove all of the ignore conditions of the specified dependency - `@dependabot unignore <dependency name> <ignore condition>` will remove the ignore condition of the specified dependency and ignore conditions
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecard.yml | 2 +- .github/workflows/sycl-nightly.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 9f8ea3499f696..896a2ea8c183a 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -57,6 +57,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 + uses: github/codeql-action/upload-sarif@b611370bb5703a7efb587f9d136a52ea24c5c38c # v3.25.11 with: sarif_file: results.sarif diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index fc0b90be7990a..32a7814fa1c5c 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -141,7 +141,7 @@ jobs: echo "TAG=$(date +'%Y-%m-%d')-${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT" fi - name: Upload binaries - uses: softprops/action-gh-release@69320dbe05506a9a39fc8ae11030b214ec2d1f87 + uses: softprops/action-gh-release@a74c6b72af54cfa997e81df42d94703d6313a2d0 with: files: | sycl_linux.tar.gz From 4c4f1b6b6927135a8743af336155ace780cc53c6 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Mon, 1 Jul 2024 10:45:20 -0400 Subject: [PATCH 55/58] [SYCL][E2E] Disable flaky test host_task_last.cpp on Gen12 Linux (#14352) https://github.com/intel/llvm/issues/14350 --------- Signed-off-by: Sarnie, Nick --- sycl/test-e2e/Graph/Explicit/host_task_last.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp index 34df0750b5366..5371ea1df3708 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp @@ -2,8 +2,10 @@ // RUN: 
%{run} %t.out // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Disabled due to https://github.com/intel/llvm/issues/14350 // Extra run to check for immediate-command-list in Level Zero -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// xRUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // REQUIRES: aspect-usm_shared_allocations From 3d90aba9c957cdd302c89eabb8be2b4cee7798e1 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 1 Jul 2024 08:23:17 -0700 Subject: [PATCH 56/58] [SYCL] Don't throw in `device_impl::has` (#14355) 1) It isn't right 2) We need this change to get rid of deprecated `sycl::exception::get_cl_code` --- sycl/source/detail/device_impl.cpp | 4 ++-- sycl/source/device.cpp | 13 +++---------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index c0b28622b962d..6e2b69850d5e1 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -719,8 +719,8 @@ bool device_impl::has(aspect Aspect) const { return call_successful && support; } } - throw runtime_error("This device aspect has not been implemented yet.", - PI_ERROR_INVALID_DEVICE); + + return false; // This device aspect has not been implemented yet. 
} bool device_impl::isAssertFailSupported() const { diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 18b9cf4036cda..423ff7be44121 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -155,16 +155,9 @@ device::get_info_impl() const { #undef __SYCL_ASPECT }; - auto UnsupportedAspects = std::remove_if( - DeviceAspects.begin(), DeviceAspects.end(), [&](aspect Aspect) { - try { - return !impl->has(Aspect); - } catch (const runtime_error &ex) { - if (ex.get_cl_code() == PI_ERROR_INVALID_DEVICE) - return true; - throw; - } - }); + auto UnsupportedAspects = + std::remove_if(DeviceAspects.begin(), DeviceAspects.end(), + [&](aspect Aspect) { return !impl->has(Aspect); }); DeviceAspects.erase(UnsupportedAspects, DeviceAspects.end()); From 40170305048cbbf40e229c6c83a2c5ec5d6278e1 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Mon, 1 Jul 2024 17:31:45 +0200 Subject: [PATCH 57/58] [SYCL][Docs] Add sycl_ext_oneapi_virtual_mem extension and implementation (#8954) This commit adds the sycl_ext_oneapi_virtual_mem experimental extension for reserving and mapping virtual address ranges. Accompanying it is the implementation in the SYCL runtime, together with CUDA and Level Zero backend support for the corresponding features. 
--------- Signed-off-by: Larsen, Steffen --- .../llvm/SYCLLowerIR/DeviceConfigFile.td | 3 +- .../sycl_ext_oneapi_virtual_mem.asciidoc | 398 ++++++++++++++++++ sycl/include/sycl/detail/pi.def | 12 + sycl/include/sycl/detail/pi.h | 145 ++++++- sycl/include/sycl/detail/pi.hpp | 2 + sycl/include/sycl/device_aspect_macros.hpp | 10 + .../ext/oneapi/virtual_mem/physical_mem.hpp | 81 ++++ .../ext/oneapi/virtual_mem/virtual_mem.hpp | 61 +++ sycl/include/sycl/info/aspects.def | 1 + sycl/include/sycl/sycl.hpp | 2 + sycl/plugins/cuda/pi_cuda.cpp | 63 +++ sycl/plugins/cuda/pi_cuda.hpp | 5 + sycl/plugins/hip/pi_hip.cpp | 63 +++ sycl/plugins/hip/pi_hip.hpp | 5 + sycl/plugins/level_zero/pi_level_zero.cpp | 138 ++++++ sycl/plugins/native_cpu/pi_native_cpu.cpp | 63 +++ sycl/plugins/native_cpu/pi_native_cpu.hpp | 5 + sycl/plugins/opencl/pi_opencl.cpp | 63 +++ sycl/plugins/unified_runtime/pi2ur.hpp | 217 ++++++++++ .../unified_runtime/pi_unified_runtime.cpp | 66 +++ sycl/source/CMakeLists.txt | 2 + sycl/source/detail/device_impl.cpp | 8 + sycl/source/detail/physical_mem_impl.hpp | 95 +++++ sycl/source/feature_test.hpp.in | 1 + sycl/source/physical_mem.cpp | 38 ++ sycl/source/virtual_mem.cpp | 183 ++++++++ .../VirtualMem/vector_with_virtual_mem.cpp | 236 +++++++++++ sycl/test/abi/pi_cuda_symbol_check.dump | 10 + sycl/test/abi/pi_hip_symbol_check.dump | 10 + sycl/test/abi/pi_level_zero_symbol_check.dump | 10 + sycl/test/abi/pi_nativecpu_symbol_check.dump | 10 + sycl/test/abi/pi_opencl_symbol_check.dump | 10 + sycl/test/abi/sycl_symbols_linux.dump | 13 + sycl/test/abi/sycl_symbols_windows.dump | 27 +- sycl/unittests/helpers/PiMockPlugin.hpp | 55 +++ 35 files changed, 2108 insertions(+), 3 deletions(-) create mode 100644 sycl/doc/extensions/experimental/sycl_ext_oneapi_virtual_mem.asciidoc create mode 100644 sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp create mode 100644 sycl/include/sycl/ext/oneapi/virtual_mem/virtual_mem.hpp create mode 100644 
sycl/source/detail/physical_mem_impl.hpp create mode 100644 sycl/source/physical_mem.cpp create mode 100644 sycl/source/virtual_mem.cpp create mode 100644 sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td index 38d5f2512a1c4..54357d1377c77 100644 --- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td +++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td @@ -82,6 +82,7 @@ def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">; def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">; def AspectExt_oneapi_private_alloca : Aspect<"ext_oneapi_private_alloca">; def AspectExt_oneapi_queue_profiling_tag : Aspect<"ext_oneapi_queue_profiling_tag">; +def AspectExt_oneapi_virtual_mem : Aspect<"ext_oneapi_virtual_mem">; // Deprecated aspects def AspectInt64_base_atomics : Aspect<"int64_base_atomics">; def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">; @@ -139,7 +140,7 @@ def : TargetInfo<"__TestAspectList", AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group, AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component, AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph, - AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag], + AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag, AspectExt_oneapi_virtual_mem], []>; // This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT // match. 
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_virtual_mem.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_virtual_mem.asciidoc new file mode 100644 index 0000000000000..72a6e1ed14f55 --- /dev/null +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_virtual_mem.asciidoc @@ -0,0 +1,398 @@ += sycl_ext_oneapi_virtual_mem + +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en +:dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + + +== Notice + +[%hardbreaks] +Copyright (C) 2023 Intel Corporation. All rights reserved. + +Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks +of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by +permission by Khronos. + + +== Contact + +To report problems with this extension, please open a new issue at: + +https://github.com/intel/llvm/issues + + +== Dependencies + +This extension is written against the SYCL 2020 revision 8 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + + +== Status + +This is an experimental extension specification, intended to provide early +access to features and gather community feedback. Interfaces defined in this +specification are implemented in {dpcpp}, but they are not finalized and may +change incompatibly in future versions of {dpcpp} without prior notice. +*Shipping software products should not rely on APIs defined in this +specification.* + + +== Backend support status + +The APIs in this extension may be used only on a device that has +`aspect::ext_oneapi_virtual_mem`. 
The application must check that the devices +in the corresponding context have this aspect before using any of the APIs +introduced in this extension. If the application fails to do this, the +implementation throws a synchronous exception with the +`errc::feature_not_supported` error code. + +== Overview + +This extension adds the notion of "virtual memory ranges" to SYCL, introducing +a way to map an address range onto multiple allocations of physical memory, +allowing users to avoid expensive reallocations and potentially running out of +device memory while relocating the corresponding memory. + + +== Specification + +=== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification. An implementation supporting this extension must predefine the +macro `SYCL_EXT_ONEAPI_VIRTUAL_MEM` to one of the values defined in the table +below. Applications can test for the existence of this macro to determine if +the implementation supports this feature, or applications can test the macro's +value to determine which of the extension's features the implementation +supports. + +[%header,cols="1,5"] +|=== +|Value +|Description + +|1 +|The APIs of this experimental extension are not versioned, so the + feature-test macro always has this value. +|=== + + +=== Device aspect + +Support for the features introduced in this extension can be queried using the +new `aspect::ext_oneapi_virtual_mem` defined as: + +```c++ +namespace sycl { + +enum class aspect : /* unspecified */ { + ... + ext_oneapi_virtual_mem +} + +} // namespace sycl +``` + + +=== Memory granularity + +Working with virtual address ranges and the underlying physical memory requires +the user to align and adjust in accordance with a specified minimum granularity. 
+ +The interfaces make the distinction between device granularity, which is the +granularity required for physical memory allocations, and context granularity, +which is the granularity required for virtual memory range reservations. + +The queries provide both a minimum and a recommended granularity. The minimum +device granularity is the smallest granularity that is supported for physical +memory allocations, and the minimum context granularity is the smallest +granularity that is supported from virtual memory range reservations. However, +the recommended granularity may be larger than these minimums and may provide +better performance. + +The interfaces for querying these granularities are defined as: + +```c++ +namespace sycl::ext::oneapi::experimental { + +enum class granularity_mode : /*unspecified*/ { + minimum, + recommended +}; + +size_t get_mem_granularity(const device &syclDevice, const context &syclContext, + granularity_mode mode = granularity_mode::recommended); + +size_t get_mem_granularity(const context &syclContext, + granularity_mode mode = granularity_mode::recommended); + +} // namespace sycl::ext::oneapi::experimental +``` + +[frame="topbot",options="header,footer"] +|===================== +|Function |Description + +|`size_t get_mem_granularity(const device &syclDevice, const context &syclContext, granularity_mode mode = granularity_mode::recommended)` | +Returns the granularity of physical memory allocations on `syclDevice` in the +`syclContext`. The `mode` argument specifies whether the query is for the +minimum or recommended granularity. + +If `syclDevice` does not have `aspect::ext_oneapi_virtual_mem` the call throws +an exception with `errc::feature_not_supported`. + +|`size_t get_mem_granularity(const context &syclContext, granularity_mode mode = granularity_mode::recommended)` | +Returns the granularity of virtual memory range reservations in the +`syclContext`. 
The `mode` argument specifies whether the query is for the +minimum or recommended granularity. + +If any device in `syclContext` does not have `aspect::ext_oneapi_virtual_mem` +the call throws an exception with `errc::feature_not_supported`. + +|===================== + +=== Reserving virtual address ranges + +Virtual address ranges are represented by a `uintptr_t` and a number of bytes +reserved for it. The `uintptr_t` must be aligned in accordance with the minimum +granularity of the corresponding `context`, as queried through +`get_mem_granularity`, and likewise the number of bytes must be a multiple of +this granularity. It is the responsibility of the user to manage the +constituents of any virtual address range they reserve. + +The interfaces for reserving, freeing, and manipulating the access mode of a +virtual address range are defined as: + +```c++ +namespace sycl::ext::oneapi::experimental { + +uintptr_t reserve_virtual_mem(uintptr_t start, size_t numBytes, const context &syclContext); +uintptr_t reserve_virtual_mem(size_t numBytes, const context &syclContext); + +void free_virtual_mem(uintptr_t ptr, size_t numBytes, const context &syclContext); + +} // namespace sycl::ext::oneapi::experimental +``` + +[frame="topbot",options="header,footer"] +|===================== +|Function |Description + +|`uintptr_t reserve_virtual_mem(uintptr_t start, size_t numBytes, const context &syclContext)` | +Reserves a virtual memory range in `syclContext` with `numBytes` bytes. + +`start` specifies the requested start of the new virtual memory range +reservation. If the implementation is unable to reserve the virtual memory range +at the specified address, the implementation will pick another suitable address. + +`start` must be aligned in accordance with the minimum granularity for +`syclContext`, as returned by a call to `get_mem_granularity`. Likewise, +`numBytes` must be a multiple of the minimum granularity. 
Attempting to call +this function without meeting these requirements results in undefined behavior. + +If any of the devices in `syclContext` do not have +`aspect::ext_oneapi_virtual_mem` the call throws an exception with +`errc::feature_not_supported`. + +|`uintptr_t reserve_virtual_mem(size_t numBytes, const context &syclContext)` | +Same as `reserve_virtual_mem(0, numBytes, syclContext)`. + +|`void free_virtual_mem(uintptr_t ptr, size_t numBytes, const context &syclContext)` | +Frees a virtual memory range specified by `ptr` and `numBytes`. `ptr` must be +the same as returned by a call to `reserve_virtual_mem` and `numBytes` must be +the same as the size of the range specified in the reservation call. + +The virtual memory range must not currently be mapped to physical memory. A call +to this function with a mapped virtual memory range results in undefined +behavior. + +|===================== + + +=== Physical memory representation + +:crs: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reference-semantics + +To represent the underlying physical device memory a virtual address is mapped +to, the `physical_mem` class is added. This new class is defined as: + +```c++ +namespace sycl::ext::oneapi::experimental { + +enum class address_access_mode : /*unspecified*/ { + none, + read, + read_write +}; + +class physical_mem { +public: + physical_mem(const device &syclDevice, const context &syclContext, size_t numBytes); + physical_mem(const queue &syclQueue, size_t numBytes); + + /* -- common interface members -- */ + + void *map(uintptr_t ptr, size_t numBytes, address_access_mode mode, size_t offset = 0) const; + + context get_context() const; + device get_device() const; + + size_t size() const noexcept; +}; + +} // namespace sycl::ext::oneapi::experimental +``` + +`physical_mem` has common reference semantics, as described in +{crs}[section 4.5.2. Common reference semantics]. 
+ +[frame="topbot",options="header,footer"] +|============================ +|Member function |Description + +|`physical_mem(const device &syclDevice, const context &syclContext, size_t numBytes)` | +Constructs a `physical_mem` instance using the `syclDevice` provided. This +device must either be contained by `syclContext` or it must be a descendent +device of some device that is contained by that context, otherwise this function +throws a synchronous exception with the `errc::invalid` error code. + +This will allocate `numBytes` of physical memory on the device. `numBytes` must +be a multiple of the granularity for `syclDevice`, as returned by a call to +`get_mem_granularity`. + +If `syclDevice` does not have `aspect::ext_oneapi_virtual_mem` the call throws +an exception with `errc::feature_not_supported`. + +If the constructor is unable to allocate the required memory on `syclDevice`, +the call throws an exception with `errc::memory_allocation`. + +|`physical_mem(const queue &syclQueue, size_t numBytes)` | +Same as `physical_mem(syclQueue.get_device(), syclQueue.get_context, numBytes)`. + +|`void *map(uintptr_t ptr, size_t numBytes, address_access_mode mode, size_t offset = 0)` | +Maps a virtual memory range, specified by `ptr` and `numBytes`, to the physical +memory corresponding to this instance of `physical_mem`, starting at an offset +of `offset` bytes. + +It is required that `offset + numBytes` is less than or equal to `size()` and +that `ptr`, `numBytes` and `offset` are all multiples of the minimum granularity +for the device associated with this instance of `physical_mem`. + +If `mode` is `address_access_mode::read` or `address_access_mode::read_write` +the returned pointer is accessible after the call as read-only or read-write +respectively. Otherwise, it is considered inaccessible and accessing it will +result in undefined behavior. + +The returned pointer is equivalent to `reinterpret_cast(ptr)`. 
+ +Writing to any address in the virtual memory range with access mode set to +`access_mode::read` results in undefined behavior. + +An accessible pointer behaves the same as a pointer to device USM memory and can +be used in place of a device USM pointer in any interface accepting one. + +A virtual memory range cannot be simultaneously mapped to more than one +physical memory region. Likewise, multiple virtual memory ranges cannot be +mapped onto the same physical memory region. Attempting to violate either of +these restrictions will result in undefined behavior. + +|`context get_context() const` | +Returns the SYCL context associated with the instance of `physical_mem`. + +|`device get_device() const` | +Returns the SYCL device associated with the instance of `physical_mem`. + +|`size_t size() const` | +Returns the size of the corresponding physical memory in bytes. + +|============================ + +Virtual memory address ranges are mapped to the a `physical_mem` through the +`map` member functions, where the access mode can also be specified. +To further get or set the access mode of a mapped virtual address range, the +user does not need to know the associated `physical_mem` and can just call the +following free functions. + +```c++ +namespace sycl::ext::oneapi::experimental { + +void set_access_mode(const void *ptr, size_t numBytes, address_access_mode mode, const context &syclContext); + +address_access_mode get_access_mode(const void *ptr, size_t numBytes, const context &syclContext); + +void unmap(const void *ptr, size_t numBytes, const context &syclContext); + +} // namespace sycl::ext::oneapi::experimental +``` + +[frame="topbot",options="header,footer"] +|===================== +|Function |Description + +|`void set_access_mode(const void *ptr, size_t numBytes, address_access_mode mode, const context &syclContext)` | +Changes the access mode of a mapped virtual memory range specified by `ptr` and +`numBytes`. 
+ +If `mode` is `address_access_mode::read` or `address_access_mode::read_write` +`ptr` pointer is accessible after the call as read-only or read-write +respectively. Otherwise, it is considered inaccessible and accessing it will +result in undefined behavior. + +The virtual memory range specified by `ptr` and `numBytes` must be a sub-range +of virtual memory ranges previously mapped to `physical_mem`. `ptr` +must be aligned to the minimum memory granularity of the device associated with +the `physical_mem` the range is mapped to and `numBytes` must be a multiple of +the minimum memory granularity of the device associated with the `physical_mem` +the range is mapped to. + +Writing to any address in the virtual memory range with access mode set to +`address_access_mode::read` results in undefined behavior. + +An accessible pointer behaves the same as a pointer to device USM memory and can +be used in place of a device USM pointer in any interface accepting one. + +|`address_access_mode get_access_mode(const void *ptr, size_t numBytes, const context &syclContext)` | +Returns the access mode of the mapped virtual memory range specified by `ptr` +and `numBytes`. + +The virtual memory range specified by `ptr` and `numBytes` must be a sub-range +of virtual memory ranges previously mapped to `physical_mem`. `ptr` +must be aligned to the minimum memory granularity of the device associated with +the `physical_mem` the range is mapped to and `numBytes` must be a multiple of +the minimum memory granularity of the device associated with the `physical_mem` +the range is mapped to. + +|`void unmap(const void *ptr, size_t numBytes, const device &syclDevice, const context &syclContext)` | +Unmaps the range specified by `ptr` and `numBytes`. The range must have been +mapped through a call to `physical_mem::map()` prior to calling this. The range +must not be a proper sub-range of a previously mapped range. 
`syclContext` must +be the same as the context returned by the `get_context()` member function on +the `physical_mem` the address range is currently mapped to. + +After this call, the full range will again be ready to be mapped through a call +to `physical_mem::map()`. + +[_Note:_ Unmapping ranges that span multiple contiguous mapped ranges is not +supported. Doing so will result in undefined behavior. This restriction may be +lifted in the future. _{endnote}_] + +[_Note:_ The destructor for `physical_mem` will not unmap ranges mapped to it. +As such, the user must call `unmap` on ranges mapped to `physical_mem` objects +prior to their destruction. _{endnote}_] + +|===================== \ No newline at end of file diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 995579d612afb..3090b2d488ee0 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -215,4 +215,16 @@ _PI_API(piextDestroyExternalSemaphore) _PI_API(piextWaitExternalSemaphore) _PI_API(piextSignalExternalSemaphore) +// Virtual memory +_PI_API(piextVirtualMemGranularityGetInfo) +_PI_API(piextPhysicalMemCreate) +_PI_API(piextPhysicalMemRetain) +_PI_API(piextPhysicalMemRelease) +_PI_API(piextVirtualMemReserve) +_PI_API(piextVirtualMemFree) +_PI_API(piextVirtualMemMap) +_PI_API(piextVirtualMemUnmap) +_PI_API(piextVirtualMemSetAccess) +_PI_API(piextVirtualMemGetInfo) + #undef _PI_API diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 79d67791ffc8d..ce7d34ef75899 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -191,9 +191,13 @@ // `win32_nt_dx12_resource` value. // the `pi_external_semaphore_handle_type` enum now has a new // `win32_nt_dx12_fence` value. 
+// 15.54 Added piextVirtualMem* functions, and piextPhysicalMem* functions, +// PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM device info descriptor, +// _pi_virtual_mem_granularity_info enum, _pi_virtual_mem_info enum and +// pi_virtual_access_flags bit flags. #define _PI_H_VERSION_MAJOR 15 -#define _PI_H_VERSION_MINOR 53 +#define _PI_H_VERSION_MINOR 54 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -505,6 +509,9 @@ typedef enum { // Timestamp enqueue PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT = 0x2011D, + + // Virtual memory support + PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM = 0x2011E, } _pi_device_info; typedef enum { @@ -756,6 +763,15 @@ typedef enum { PI_SAMPLER_CUBEMAP_FILTER_MODE_SEAMLESS = 0x1143, } _pi_sampler_cubemap_filter_mode; +typedef enum { + PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM = 0x30100, + PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED = 0x30101, +} _pi_virtual_mem_granularity_info; + +typedef enum { + PI_EXT_ONEAPI_VIRTUAL_MEM_INFO_ACCESS_MODE = 0x30200, +} _pi_virtual_mem_info; + using pi_context_properties = intptr_t; using pi_device_exec_capabilities = pi_bitfield; @@ -848,6 +864,10 @@ constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_NO_IMMEDIATE = (1 << constexpr pi_queue_properties PI_EXT_QUEUE_FLAG_SUBMISSION_IMMEDIATE = (1 << 8); // clang-format on +using pi_virtual_access_flags = pi_bitfield; +constexpr pi_virtual_access_flags PI_VIRTUAL_ACCESS_FLAG_RW = (1 << 0); +constexpr pi_virtual_access_flags PI_VIRTUAL_ACCESS_FLAG_READ_ONLY = (1 << 1); + typedef enum { // No preference for SLM or data cache. 
PI_EXT_KERNEL_EXEC_INFO_CACHE_DEFAULT = 0x0, @@ -889,6 +909,8 @@ using pi_program_binary_type = _pi_program_binary_type; using pi_kernel_info = _pi_kernel_info; using pi_profiling_info = _pi_profiling_info; using pi_kernel_cache_config = _pi_kernel_cache_config; +using pi_virtual_mem_granularity_info = _pi_virtual_mem_granularity_info; +using pi_virtual_mem_info = _pi_virtual_mem_info; using pi_image_copy_flags = _pi_image_copy_flags; @@ -1241,6 +1263,7 @@ struct _pi_program; struct _pi_kernel; struct _pi_event; struct _pi_sampler; +struct _pi_physical_mem; using pi_platform = _pi_platform *; using pi_device = _pi_device *; @@ -1255,6 +1278,7 @@ using pi_image_handle = pi_uint64; using pi_image_mem_handle = void *; using pi_interop_mem_handle = pi_uint64; using pi_interop_semaphore_handle = pi_uint64; +using pi_physical_mem = _pi_physical_mem *; typedef struct { pi_image_channel_order image_channel_order; @@ -2338,6 +2362,125 @@ pi_result piextEnqueueDeviceGlobalVariableRead( size_t count, size_t offset, void *dst, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); +/// +/// Virtual memory +/// + +/// API for getting information about the minimum and recommended granularity +/// of physical and virtual memory. +/// +/// \param context is the context to get the granularity from. +/// \param device is the device to get the granularity from. +/// \param param_name is the type of query to perform. +/// \param param_value_size is the size of the result in bytes. +/// \param param_value is the result. +/// \param param_value_size_ret is how many bytes were written. +__SYCL_EXPORT pi_result piextVirtualMemGranularityGetInfo( + pi_context context, pi_device device, + pi_virtual_mem_granularity_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret); + +/// API for creating a physical memory handle that virtual memory can be mapped +/// to. 
+/// +/// \param context is the context within which the physical memory is allocated. +/// \param device is the device the physical memory is on. +/// \param mem_size is the size of physical memory to allocate. This must be a +/// multiple of the minimum virtual memory granularity. +/// \param ret_physical_mem is the handle for the resulting physical memory. +__SYCL_EXPORT pi_result +piextPhysicalMemCreate(pi_context context, pi_device device, size_t mem_size, + pi_physical_mem *ret_physical_mem); + +/// API for retaining a physical memory handle. +/// +/// \param physical_mem is the handle for the physical memory to retain. +__SYCL_EXPORT pi_result piextPhysicalMemRetain(pi_physical_mem physical_mem); + +/// API for releasing a physical memory handle. +/// +/// \param physical_mem is the handle for the physical memory to free. +__SYCL_EXPORT pi_result piextPhysicalMemRelease(pi_physical_mem physical_mem); + +/// API for reserving a virtual memory range. +/// +/// \param context is the context within which the virtual memory range is +/// reserved. +/// \param start is a pointer to the start of the region to reserve. If nullptr +/// the implementation selects a start address. +/// \param range_size is the size of the virtual address range to reserve in +/// bytes. +/// \param ret_ptr is the pointer to the start of the resulting virtual memory +/// range. +__SYCL_EXPORT pi_result piextVirtualMemReserve(pi_context context, + const void *start, + size_t range_size, + void **ret_ptr); + +/// API for freeing a virtual memory range. +/// +/// \param context is the context within which the virtual memory range is +/// reserved. +/// \param ptr is the pointer to the start of the virtual memory range. +/// \param range_size is the size of the virtual address range. +__SYCL_EXPORT pi_result piextVirtualMemFree(pi_context context, const void *ptr, + size_t range_size); + +/// API for mapping a virtual memory range to a physical memory allocation at +/// a given offset.
+/// +/// \param context is the context within which both the virtual memory range is +/// reserved and the physical memory is allocated. +/// \param ptr is the pointer to the start of the virtual memory range. +/// \param range_size is the size of the virtual address range. +/// \param physical_mem is the handle for the physical memory to map ptr to. +/// \param offset is the offset into physical_mem in bytes to map ptr to. +/// \param flags is the access flags to set for the mapping. +__SYCL_EXPORT pi_result piextVirtualMemMap(pi_context context, const void *ptr, + size_t range_size, + pi_physical_mem physical_mem, + size_t offset, + pi_virtual_access_flags flags); + +/// API for unmapping a virtual memory range previously mapped in a context. +/// After a call to this function, the virtual memory range is left in a state +/// ready to be remapped. +/// +/// \param context is the context within which the virtual memory range is +/// currently mapped. +/// \param ptr is the pointer to the start of the virtual memory range. +/// \param range_size is the size of the virtual address range in bytes. +__SYCL_EXPORT pi_result piextVirtualMemUnmap(pi_context context, + const void *ptr, + size_t range_size); + +/// API for setting the access mode of a mapped virtual memory range. +/// +/// \param context is the context within which the virtual memory range is +/// currently mapped. +/// \param ptr is the pointer to the start of the virtual memory range. +/// \param range_size is the size of the virtual address range in bytes. +/// \param flags is the access flags to set for the mapped virtual access range. +__SYCL_EXPORT pi_result piextVirtualMemSetAccess(pi_context context, + const void *ptr, + size_t range_size, + pi_virtual_access_flags flags); + +/// API for getting info about a mapped virtual memory range. +/// +/// \param context is the context within which the virtual memory range is +/// currently mapped. 
+/// \param ptr is the pointer to the start of the virtual memory range. +/// \param range_size is the size of the virtual address range in bytes. +/// \param param_name is the type of query to perform. +/// \param param_value_size is the size of the result in bytes. +/// \param param_value is the result. +/// \param param_value_size_ret is how many bytes were written. +__SYCL_EXPORT pi_result +piextVirtualMemGetInfo(pi_context context, const void *ptr, size_t range_size, + pi_virtual_mem_info param_name, size_t param_value_size, + void *param_value, size_t *param_value_size_ret); + /// /// Plugin /// diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 3500c576bb599..1fe21d36a8aaa 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -146,6 +146,8 @@ using PiExternalMemDescriptor = ::pi_external_mem_descriptor; using PiExternalSemaphoreDescriptor = ::pi_external_semaphore_descriptor; using PiImageOffset = ::pi_image_offset_struct; using PiImageRegion = ::pi_image_region_struct; +using PiPhysicalMem = ::pi_physical_mem; +using PiVirtualAccessFlags = ::pi_virtual_access_flags; __SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext, pi_context_extended_deleter func, diff --git a/sycl/include/sycl/device_aspect_macros.hpp b/sycl/include/sycl/device_aspect_macros.hpp index df6c827de60f2..d756b0a62e88a 100644 --- a/sycl/include/sycl/device_aspect_macros.hpp +++ b/sycl/include/sycl/device_aspect_macros.hpp @@ -381,6 +381,11 @@ #define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_queue_profiling_tag__ 0 #endif +#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_virtual_mem__ +// __SYCL_ASPECT(ext_oneapi_virtual_mem, 74) +#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_virtual_mem__ 0 +#endif + #ifndef __SYCL_ANY_DEVICE_HAS_host__ // __SYCL_ASPECT(host, 0) #define __SYCL_ANY_DEVICE_HAS_host__ 0 @@ -750,3 +755,8 @@ // __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73) #define 
__SYCL_ANY_DEVICE_HAS_ext_oneapi_queue_profiling_tag__ 0 #endif + +#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_virtual_mem__ +// __SYCL_ASPECT(ext_oneapi_virtual_mem, 74) +#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_virtual_mem__ 0 +#endif diff --git a/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp b/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp new file mode 100644 index 0000000000000..24d371fe8c6fd --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/virtual_mem/physical_mem.hpp @@ -0,0 +1,81 @@ +//==--- physical_mem.hpp - sycl_ext_oneapi_virtual_mem physical_mem class --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace sycl { +inline namespace _V1 { + +namespace detail { +class physical_mem_impl; +} // namespace detail + +namespace ext::oneapi::experimental { + +enum class address_access_mode : char { none = 0, read = 1, read_write = 2 }; + +class __SYCL_EXPORT physical_mem + : public sycl::detail::OwnerLessBase { +public: + physical_mem(const device &SyclDevice, const context &SyclContext, + size_t NumBytes); + + physical_mem(const queue &SyclQueue, size_t NumBytes) + : physical_mem(SyclQueue.get_device(), SyclQueue.get_context(), + NumBytes) {} + + physical_mem(const physical_mem &rhs) = default; + physical_mem(physical_mem &&rhs) = default; + + physical_mem &operator=(const physical_mem &rhs) = default; + physical_mem &operator=(physical_mem &&rhs) = default; + + ~physical_mem() noexcept(false) {}; + + bool operator==(const physical_mem &rhs) const { return impl == rhs.impl; } + bool operator!=(const physical_mem &rhs) const { return !(*this == rhs); } + + void *map(uintptr_t Ptr, size_t NumBytes, 
address_access_mode Mode, + size_t Offset = 0) const; + + context get_context() const; + device get_device() const; + + size_t size() const noexcept; + +private: + std::shared_ptr impl; + + template + friend decltype(Obj::impl) + sycl::detail::getSyclObjImpl(const Obj &SyclObject); + + template + friend T sycl::detail::createSyclObjFromImpl(decltype(T::impl) ImplObj); +}; + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl + +namespace std { +template <> struct hash { + size_t operator()( + const sycl::ext::oneapi::experimental::physical_mem &PhysicalMem) const { + return hash>()( + sycl::detail::getSyclObjImpl(PhysicalMem)); + } +}; +} // namespace std diff --git a/sycl/include/sycl/ext/oneapi/virtual_mem/virtual_mem.hpp b/sycl/include/sycl/ext/oneapi/virtual_mem/virtual_mem.hpp new file mode 100644 index 0000000000000..74a42354eaa01 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/virtual_mem/virtual_mem.hpp @@ -0,0 +1,61 @@ +//==- virtual_mem.hpp - sycl_ext_oneapi_virtual_mem virtual mem free funcs -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +enum class granularity_mode : char { + minimum = 0, + recommended = 1, +}; + +__SYCL_EXPORT size_t +get_mem_granularity(const device &SyclDevice, const context &SyclContext, + granularity_mode Mode = granularity_mode::recommended); + +__SYCL_EXPORT size_t +get_mem_granularity(const context &SyclContext, + granularity_mode Mode = granularity_mode::recommended); + +__SYCL_EXPORT uintptr_t reserve_virtual_mem(uintptr_t Start, size_t NumBytes, + const context &SyclContext); + +inline uintptr_t reserve_virtual_mem(size_t NumBytes, + const context &SyclContext) { + return reserve_virtual_mem(0, NumBytes, SyclContext); +} + +__SYCL_EXPORT void free_virtual_mem(uintptr_t Ptr, size_t NumBytes, + const context &SyclContext); + +__SYCL_EXPORT void set_access_mode(const void *Ptr, size_t NumBytes, + address_access_mode Mode, + const context &SyclContext); + +__SYCL_EXPORT address_access_mode get_access_mode(const void *Ptr, + size_t NumBytes, + const context &SyclContext); + +__SYCL_EXPORT void unmap(const void *Ptr, size_t NumBytes, + const context &SyclContext); + +} // Namespace ext::oneapi::experimental +} // namespace _V1 +} // Namespace sycl diff --git a/sycl/include/sycl/info/aspects.def b/sycl/include/sycl/info/aspects.def index 2d9cee1351d7a..3b744a89dbb90 100644 --- a/sycl/include/sycl/info/aspects.def +++ b/sycl/include/sycl/info/aspects.def @@ -68,3 +68,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 70) __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d_usm, 71) __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72) __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73) +__SYCL_ASPECT(ext_oneapi_virtual_mem, 74) diff --git 
a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 53a60381f0b8d..16b5e8f0f6c40 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -111,4 +111,6 @@ #include #include #include +#include +#include #include diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 0077b245905db..1628b1537fae5 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1298,6 +1298,69 @@ pi_result piextPeerAccessGetInfo(pi_device command_device, ParamValueSizeRet); } +pi_result +piextVirtualMemGranularityGetInfo(pi_context context, pi_device device, + pi_virtual_mem_granularity_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGranularityGetInfo(context, device, param_name, + param_value_size, param_value, + param_value_size_ret); +} + +pi_result piextPhysicalMemCreate(pi_context context, pi_device device, + size_t mem_size, + pi_physical_mem *ret_physical_mem) { + return pi2ur::piextPhysicalMemCreate(context, device, mem_size, + ret_physical_mem); +} + +pi_result piextPhysicalMemRetain(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRetain(physical_mem); +} + +pi_result piextPhysicalMemRelease(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRelease(physical_mem); +} + +pi_result piextVirtualMemReserve(pi_context context, const void *start, + size_t range_size, void **ret_ptr) { + return pi2ur::piextVirtualMemReserve(context, start, range_size, ret_ptr); +} + +pi_result piextVirtualMemFree(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemFree(context, ptr, range_size); +} + +pi_result piextVirtualMemMap(pi_context context, const void *ptr, + size_t range_size, pi_physical_mem physical_mem, + size_t offset, pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemMap(context, ptr, range_size, physical_mem, + offset, flags); +} + +pi_result 
piextVirtualMemUnmap(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemUnmap(context, ptr, range_size); +} + +pi_result piextVirtualMemSetAccess(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemSetAccess(context, ptr, range_size, flags); +} + +pi_result piextVirtualMemGetInfo(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_mem_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGetInfo(context, ptr, range_size, param_name, + param_value_size, param_value, + param_value_size_ret); +} + const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 2b5d77b26ea9d..8c5112f4cc9d1 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -81,4 +82,8 @@ struct _pi_ext_command_buffer : ur_exp_command_buffer_handle_t_ { using ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_; }; +struct _pi_physical_mem : ur_physical_mem_handle_t_ { + using ur_physical_mem_handle_t_::ur_physical_mem_handle_t_; +}; + #endif // PI_CUDA_HPP diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 33b7388f9c884..c3324463690eb 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1301,6 +1301,69 @@ pi_result piextPeerAccessGetInfo(pi_device command_device, ParamValueSizeRet); } +pi_result +piextVirtualMemGranularityGetInfo(pi_context context, pi_device device, + pi_virtual_mem_granularity_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGranularityGetInfo(context, device, param_name, + param_value_size, param_value, 
+ param_value_size_ret); +} + +pi_result piextPhysicalMemCreate(pi_context context, pi_device device, + size_t mem_size, + pi_physical_mem *ret_physical_mem) { + return pi2ur::piextPhysicalMemCreate(context, device, mem_size, + ret_physical_mem); +} + +pi_result piextPhysicalMemRetain(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRetain(physical_mem); +} + +pi_result piextPhysicalMemRelease(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRelease(physical_mem); +} + +pi_result piextVirtualMemReserve(pi_context context, const void *start, + size_t range_size, void **ret_ptr) { + return pi2ur::piextVirtualMemReserve(context, start, range_size, ret_ptr); +} + +pi_result piextVirtualMemFree(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemFree(context, ptr, range_size); +} + +pi_result piextVirtualMemMap(pi_context context, const void *ptr, + size_t range_size, pi_physical_mem physical_mem, + size_t offset, pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemMap(context, ptr, range_size, physical_mem, + offset, flags); +} + +pi_result piextVirtualMemUnmap(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemUnmap(context, ptr, range_size); +} + +pi_result piextVirtualMemSetAccess(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemSetAccess(context, ptr, range_size, flags); +} + +pi_result piextVirtualMemGetInfo(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_mem_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGetInfo(context, ptr, range_size, param_name, + param_value_size, param_value, + param_value_size_ret); +} + const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { diff --git a/sycl/plugins/hip/pi_hip.hpp 
b/sycl/plugins/hip/pi_hip.hpp index 018d069f5fe7f..bec26c9866fdb 100644 --- a/sycl/plugins/hip/pi_hip.hpp +++ b/sycl/plugins/hip/pi_hip.hpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -94,4 +95,8 @@ struct _pi_ext_command_buffer : ur_exp_command_buffer_handle_t_ { using ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_; }; +struct _pi_physical_mem : ur_physical_mem_handle_t_ { + using ur_physical_mem_handle_t_::ur_physical_mem_handle_t_; +}; + #endif // PI_HIP_HPP diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index f88e8c1ed3cd3..bab365effe85f 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1424,6 +1424,144 @@ piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { return pi2ur::piextCommandBufferReleaseCommand(Command); } +/// API for getting information about the minimum and recommended granularity +/// of physical and virtual memory. +/// +/// \param Context is the context to get the granularity from. +/// \param Device is the device to get the granularity from. +/// \param MemSize is the potentially unadjusted size to get granularity for. +/// \param ParamName is the type of query to perform. +/// \param ParamValueSize is the size of the result in bytes. +/// \param ParamValue is the result. +/// \param ParamValueSizeRet is how many bytes were written. +pi_result +piextVirtualMemGranularityGetInfo(pi_context Context, pi_device Device, + pi_virtual_mem_granularity_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGranularityGetInfo(Context, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +/// API for creating a physical memory handle that virtual memory can be mapped +/// to. +/// +/// \param Context is the context within which the physical memory is allocated. 
+/// \param Device is the device the physical memory is on. +/// \param MemSize is the size of physical memory to allocate. This must be a +/// multiple of the minimum virtual memory granularity. +/// \param RetPhysicalMem is the handle for the resulting physical memory. +pi_result piextPhysicalMemCreate(pi_context Context, pi_device Device, + size_t MemSize, + pi_physical_mem *RetPhysicalMem) { + return pi2ur::piextPhysicalMemCreate(Context, Device, MemSize, + RetPhysicalMem); +} + +/// API for retaining a physical memory handle. +/// +/// \param PhysicalMem is the handle for the physical memory to retain. +pi_result piextPhysicalMemRetain(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRetain(PhysicalMem); +} + +/// API for releasing a physical memory handle. +/// +/// \param PhysicalMem is the handle for the physical memory to free. +pi_result piextPhysicalMemRelease(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRelease(PhysicalMem); +} + +/// API for reserving a virtual memory range. +/// +/// \param Context is the context within which the virtual memory range is +/// reserved. +/// \param Start is a pointer to the start of the region to reserve. If nullptr +/// the implementation selects a start address. +/// \param RangeSize is the size of the virtual address range to reserve in +/// bytes. +/// \param RetPtr is the pointer to the start of the resulting virtual memory +/// range. +pi_result piextVirtualMemReserve(pi_context Context, const void *Start, + size_t RangeSize, void **RetPtr) { + return pi2ur::piextVirtualMemReserve(Context, Start, RangeSize, RetPtr); +} + +/// API for freeing a virtual memory range. +/// +/// \param Context is the context within which the virtual memory range is +/// reserved. +/// \param Ptr is the pointer to the start of the virtual memory range. +/// \param RangeSize is the size of the virtual address range. 
+pi_result piextVirtualMemFree(pi_context Context, const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemFree(Context, Ptr, RangeSize); +} + +/// API for mapping a virtual memory range to a physical memory allocation at +/// a given offset. +/// +/// \param Context is the context within which both the virtual memory range is +/// reserved and the physical memory is allocated. +/// \param Ptr is the pointer to the start of the virtual memory range. +/// \param RangeSize is the size of the virtual address range. +/// \param PhysicalMem is the handle for the physical memory to map Ptr to. +/// \param Offset is the offset into PhysicalMem in bytes to map Ptr to. +/// \param Flags is the access flags to set for the mapping. +pi_result piextVirtualMemMap(pi_context Context, const void *Ptr, + size_t RangeSize, pi_physical_mem PhysicalMem, + size_t Offset, pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemMap(Context, Ptr, RangeSize, PhysicalMem, Offset, + Flags); +} + +/// API for unmapping a virtual memory range previously mapped in a context. +/// After a call to this function, the virtual memory range is left in a state +/// ready to be remapped. +/// +/// \param Context is the context within which the virtual memory range is +/// currently mapped. +/// \param Ptr is the pointer to the start of the virtual memory range. +/// \param RangeSize is the size of the virtual address range in bytes. +pi_result piextVirtualMemUnmap(pi_context Context, const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemUnmap(Context, Ptr, RangeSize); +} + +/// API for setting the access mode of a mapped virtual memory range. +/// +/// \param Context is the context within which the virtual memory range is +/// currently mapped. +/// \param Ptr is the pointer to the start of the virtual memory range. +/// \param RangeSize is the size of the virtual address range in bytes.
+/// \param Flags is the access flags to set for the mapped virtual access range. +pi_result piextVirtualMemSetAccess(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemSetAccess(Context, Ptr, RangeSize, Flags); +} + +/// API for getting info about a mapped virtual memory range. +/// +/// \param Context is the context within which the virtual memory range is +/// currently mapped. +/// \param Ptr is the pointer to the start of the virtual memory range. +/// \param RangeSize is the size of the virtual address range in bytes. +/// \param ParamName is the type of query to perform. +/// \param ParamValueSize is the size of the result in bytes. +/// \param ParamValue is the result. +/// \param ParamValueSizeRet is how many bytes were written. +pi_result piextVirtualMemGetInfo(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGetInfo(Context, Ptr, RangeSize, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + const char SupportedVersion[] = _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { // missing diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index d867caea5e23d..2276e9f78f7ea 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -1321,6 +1321,69 @@ pi_result piextKernelSuggestMaxCooperativeGroupCount( return PI_ERROR_UNSUPPORTED_FEATURE; } +pi_result +piextVirtualMemGranularityGetInfo(pi_context context, pi_device device, + pi_virtual_mem_granularity_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGranularityGetInfo(context, device, param_name, + param_value_size, param_value, + param_value_size_ret); +} + +pi_result 
piextPhysicalMemCreate(pi_context context, pi_device device, + size_t mem_size, + pi_physical_mem *ret_physical_mem) { + return pi2ur::piextPhysicalMemCreate(context, device, mem_size, + ret_physical_mem); +} + +pi_result piextPhysicalMemRetain(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRetain(physical_mem); +} + +pi_result piextPhysicalMemRelease(pi_physical_mem physical_mem) { + return pi2ur::piextPhysicalMemRelease(physical_mem); +} + +pi_result piextVirtualMemReserve(pi_context context, const void *start, + size_t range_size, void **ret_ptr) { + return pi2ur::piextVirtualMemReserve(context, start, range_size, ret_ptr); +} + +pi_result piextVirtualMemFree(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemFree(context, ptr, range_size); +} + +pi_result piextVirtualMemMap(pi_context context, const void *ptr, + size_t range_size, pi_physical_mem physical_mem, + size_t offset, pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemMap(context, ptr, range_size, physical_mem, + offset, flags); +} + +pi_result piextVirtualMemUnmap(pi_context context, const void *ptr, + size_t range_size) { + return pi2ur::piextVirtualMemUnmap(context, ptr, range_size); +} + +pi_result piextVirtualMemSetAccess(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_access_flags flags) { + return pi2ur::piextVirtualMemSetAccess(context, ptr, range_size, flags); +} + +pi_result piextVirtualMemGetInfo(pi_context context, const void *ptr, + size_t range_size, + pi_virtual_mem_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + return pi2ur::piextVirtualMemGetInfo(context, ptr, range_size, param_name, + param_value_size, param_value, + param_value_size_ret); +} + // Initialize function table with stubs. 
#define _PI_API(api) \ (PluginInit->PiFunctionTable).api = (decltype(&::api))(&api); diff --git a/sycl/plugins/native_cpu/pi_native_cpu.hpp b/sycl/plugins/native_cpu/pi_native_cpu.hpp index 1d92580997b76..287b3c03115b6 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.hpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -43,3 +44,7 @@ struct _pi_program : ur_program_handle_t_ { struct _pi_queue : ur_queue_handle_t_ { using ur_queue_handle_t_::ur_queue_handle_t_; }; + +struct _pi_physical_mem : ur_physical_mem_handle_t_ { + using ur_physical_mem_handle_t_::ur_physical_mem_handle_t_; +}; diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 1fef329d179af..1d340b5685f4e 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -1228,6 +1228,69 @@ pi_result piextPeerAccessGetInfo(pi_device command_device, ParamValueSizeRet); } +pi_result +piextVirtualMemGranularityGetInfo(pi_context Context, pi_device Device, + pi_virtual_mem_granularity_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGranularityGetInfo(Context, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +pi_result piextPhysicalMemCreate(pi_context Context, pi_device Device, + size_t MemSize, + pi_physical_mem *RetPhysicalMem) { + return pi2ur::piextPhysicalMemCreate(Context, Device, MemSize, + RetPhysicalMem); +} + +pi_result piextPhysicalMemRetain(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRetain(PhysicalMem); +} + +pi_result piextPhysicalMemRelease(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRelease(PhysicalMem); +} + +pi_result piextVirtualMemReserve(pi_context Context, const void *Start, + size_t RangeSize, void **RetPtr) { + return pi2ur::piextVirtualMemReserve(Context, Start, RangeSize, RetPtr); +} + +pi_result 
piextVirtualMemFree(pi_context Context, const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemFree(Context, Ptr, RangeSize); +} + +pi_result piextVirtualMemMap(pi_context Context, const void *Ptr, + size_t RangeSize, pi_physical_mem PhysicalMem, + size_t Offset, pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemMap(Context, Ptr, RangeSize, PhysicalMem, Offset, + Flags); +} + +pi_result piextVirtualMemUnmap(pi_context Context, const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemUnmap(Context, Ptr, RangeSize); +} + +pi_result piextVirtualMemSetAccess(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemSetAccess(Context, Ptr, RangeSize, Flags); +} + +pi_result piextVirtualMemGetInfo(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGetInfo(Context, Ptr, RangeSize, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + pi_result piTearDown(void *PluginParameter) { return pi2ur::piTearDown(PluginParameter); } diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 3ee63a025593b..f22e672d84423 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -678,6 +678,31 @@ inline pi_result ur2piSamplerInfoValue(ur_sampler_info_t ParamName, } } +inline pi_result ur2piVirtualMemInfoValue(ur_virtual_mem_info_t ParamName, + size_t ParamValueSizePI, + size_t *ParamValueSizeUR, + void *ParamValue) { + + ConvertHelper Value(ParamValueSizePI, ParamValue, ParamValueSizeUR); + switch (ParamName) { + case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: { + auto ConvertFunc = [](ur_virtual_mem_access_flags_t UrValue) { + pi_virtual_access_flags PiValue = 0; + if (UrValue & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) + PiValue |= 
PI_VIRTUAL_ACCESS_FLAG_RW; + if (UrValue & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY) + PiValue |= PI_VIRTUAL_ACCESS_FLAG_READ_ONLY; + return PiValue; + }; + return Value + .convert( + ConvertFunc); + } + default: + return PI_SUCCESS; + } +} + // Translate UR device info values to PI info values inline pi_result ur2piUSMAllocInfoValue(ur_usm_alloc_info_t ParamName, size_t ParamValueSizePI, @@ -1311,6 +1336,8 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, PI_TO_UR_MAP_DEVICE_INFO( PI_EXT_ONEAPI_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP) + PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM, + UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT) #undef PI_TO_UR_MAP_DEVICE_INFO default: return PI_ERROR_UNKNOWN; @@ -5665,4 +5692,194 @@ inline pi_result piextSignalExternalSemaphore( // Bindless Images Extension /////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// Virtual Memory + +inline pi_result +piextVirtualMemGranularityGetInfo(pi_context Context, pi_device Device, + pi_virtual_mem_granularity_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_device_handle_t UrDevice = reinterpret_cast(Device); + + ur_virtual_mem_granularity_info_t InfoType{}; + switch (ParamName) { + case PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM: + InfoType = UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM; + break; + case PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED: + InfoType = UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED; + break; + default: + return PI_ERROR_UNKNOWN; + } + + HANDLE_ERRORS(urVirtualMemGranularityGetInfo(UrContext, UrDevice, InfoType, + ParamValueSize, ParamValue, + 
ParamValueSizeRet)); + + return PI_SUCCESS; +} + +inline pi_result piextPhysicalMemCreate(pi_context Context, pi_device Device, + size_t MemSize, + pi_physical_mem *RetPhyscialMem) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_device_handle_t UrDevice = reinterpret_cast(Device); + + ur_physical_mem_handle_t *UrPhysicalMem = + reinterpret_cast(RetPhyscialMem); + + HANDLE_ERRORS(urPhysicalMemCreate(UrContext, UrDevice, MemSize, nullptr, + UrPhysicalMem)); + + return PI_SUCCESS; +} + +inline pi_result piextPhysicalMemRetain(pi_physical_mem PhysicalMem) { + PI_ASSERT(PhysicalMem, PI_ERROR_INVALID_ARG_VALUE); + + ur_physical_mem_handle_t UrPhysicalMem = + reinterpret_cast(PhysicalMem); + + HANDLE_ERRORS(urPhysicalMemRetain(UrPhysicalMem)); + + return PI_SUCCESS; +} + +inline pi_result piextPhysicalMemRelease(pi_physical_mem PhysicalMem) { + + ur_physical_mem_handle_t UrPhysicalMem = + reinterpret_cast(PhysicalMem); + + HANDLE_ERRORS(urPhysicalMemRelease(UrPhysicalMem)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemReserve(pi_context Context, const void *Start, + size_t RangeSize, void **RetPtr) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(RetPtr, PI_ERROR_INVALID_ARG_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + HANDLE_ERRORS(urVirtualMemReserve(UrContext, Start, RangeSize, RetPtr)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemFree(pi_context Context, const void *Ptr, + size_t RangeSize) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Ptr, PI_ERROR_INVALID_ARG_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + HANDLE_ERRORS(urVirtualMemFree(UrContext, Ptr, RangeSize)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemSetAccess(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_access_flags Flags) { + 
PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Ptr, PI_ERROR_INVALID_ARG_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_virtual_mem_access_flags_t UrFlags = 0; + if (Flags & PI_VIRTUAL_ACCESS_FLAG_RW) + UrFlags |= UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE; + if (Flags & PI_VIRTUAL_ACCESS_FLAG_READ_ONLY) + UrFlags |= UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY; + + HANDLE_ERRORS(urVirtualMemSetAccess(UrContext, Ptr, RangeSize, UrFlags)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemMap(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_physical_mem PhysicalMem, size_t Offset, + pi_virtual_access_flags Flags) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Ptr, PI_ERROR_INVALID_ARG_VALUE); + PI_ASSERT(PhysicalMem, PI_ERROR_INVALID_ARG_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + ur_physical_mem_handle_t UrPhysicalMem = + reinterpret_cast(PhysicalMem); + + ur_virtual_mem_access_flags_t UrFlags = 0; + if (Flags & PI_VIRTUAL_ACCESS_FLAG_RW) + UrFlags |= UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE; + if (Flags & PI_VIRTUAL_ACCESS_FLAG_READ_ONLY) + UrFlags |= UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY; + + HANDLE_ERRORS(urVirtualMemMap(UrContext, Ptr, RangeSize, UrPhysicalMem, + Offset, UrFlags)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemUnmap(pi_context Context, const void *Ptr, + size_t RangeSize) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Ptr, PI_ERROR_INVALID_ARG_VALUE); + + ur_context_handle_t UrContext = + reinterpret_cast(Context); + + HANDLE_ERRORS(urVirtualMemUnmap(UrContext, Ptr, RangeSize)); + + return PI_SUCCESS; +} + +inline pi_result piextVirtualMemGetInfo(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_virtual_mem_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT); + PI_ASSERT(Ptr, PI_ERROR_INVALID_ARG_VALUE); + + 
ur_context_handle_t UrContext = + reinterpret_cast(Context); + + ur_virtual_mem_info_t InfoType{}; + switch (ParamName) { + case PI_EXT_ONEAPI_VIRTUAL_MEM_INFO_ACCESS_MODE: + InfoType = UR_VIRTUAL_MEM_INFO_ACCESS_MODE; + break; + default: + return PI_ERROR_UNKNOWN; + } + + HANDLE_ERRORS(urVirtualMemGetInfo(UrContext, Ptr, RangeSize, InfoType, + ParamValueSize, ParamValue, + ParamValueSizeRet)); + ur2piVirtualMemInfoValue(InfoType, ParamValueSize, &ParamValueSize, + ParamValue); + + return PI_SUCCESS; +} + +// Virtual Memory +/////////////////////////////////////////////////////////////////////////////// + } // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index 30ba9a7afc8b1..7e268199bba77 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1189,6 +1189,72 @@ piextCommandBufferReleaseCommand(pi_ext_command_buffer_command Command) { return pi2ur::piextCommandBufferReleaseCommand(Command); } +__SYCL_EXPORT pi_result piextVirtualMemGranularityGetInfo( + pi_context Context, pi_device Device, + pi_virtual_mem_granularity_info ParamName, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGranularityGetInfo(Context, Device, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + +__SYCL_EXPORT pi_result +piextPhysicalMemCreate(pi_context Context, pi_device Device, size_t MemSize, + pi_physical_mem *RetPhsycialMem) { + return pi2ur::piextPhysicalMemCreate(Context, Device, MemSize, + RetPhsycialMem); +} + +__SYCL_EXPORT pi_result piextPhysicalMemRetain(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRetain(PhysicalMem); +} + +__SYCL_EXPORT pi_result piextPhysicalMemRelease(pi_physical_mem PhysicalMem) { + return pi2ur::piextPhysicalMemRelease(PhysicalMem); +} + +__SYCL_EXPORT pi_result piextVirtualMemReserve(pi_context Context, + const 
void *Start, + size_t RangeSize, + void **RetPtr) { + return pi2ur::piextVirtualMemReserve(Context, Start, RangeSize, RetPtr); +} + +__SYCL_EXPORT pi_result piextVirtualMemFree(pi_context Context, const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemFree(Context, Ptr, RangeSize); +} + +__SYCL_EXPORT pi_result +piextVirtualMemSetAccess(pi_context Context, const void *Ptr, size_t RangeSize, + pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemSetAccess(Context, Ptr, RangeSize, Flags); +} + +__SYCL_EXPORT pi_result piextVirtualMemMap(pi_context Context, const void *Ptr, + size_t RangeSize, + pi_physical_mem PhysicalMem, + size_t Offset, + pi_virtual_access_flags Flags) { + return pi2ur::piextVirtualMemMap(Context, Ptr, RangeSize, PhysicalMem, Offset, + Flags); +} + +__SYCL_EXPORT pi_result piextVirtualMemUnmap(pi_context Context, + const void *Ptr, + size_t RangeSize) { + return pi2ur::piextVirtualMemUnmap(Context, Ptr, RangeSize); +} + +__SYCL_EXPORT pi_result +piextVirtualMemGetInfo(pi_context Context, const void *Ptr, size_t RangeSize, + pi_virtual_mem_info ParamName, size_t ParamValueSize, + void *ParamValue, size_t *ParamValueSizeRet) { + return pi2ur::piextVirtualMemGetInfo(Context, Ptr, RangeSize, ParamName, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + __SYCL_EXPORT pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, uint64_t *HostTime) { diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 74497db20c9f1..f915ef4e2cb8e 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -257,11 +257,13 @@ set(SYCL_COMMON_SOURCES "interop_handle.cpp" "kernel.cpp" "kernel_bundle.cpp" + "physical_mem.cpp" "platform.cpp" "queue.cpp" "sampler.cpp" "stream.cpp" "spirv_ops.cpp" + "virtual_mem.cpp" "$<$:detail/windows_pi.cpp>" "$<$,$>:detail/posix_pi.cpp>" ) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 6e2b69850d5e1..3295188c295ba 
100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -718,6 +718,14 @@ bool device_impl::has(aspect Aspect) const { sizeof(pi_bool), &support, nullptr) == PI_SUCCESS; return call_successful && support; } + case aspect::ext_oneapi_virtual_mem: { + pi_bool support = PI_FALSE; + bool call_successful = + getPlugin()->call_nocheck( + MDevice, PI_EXT_ONEAPI_DEVICE_INFO_SUPPORTS_VIRTUAL_MEM, + sizeof(pi_bool), &support, nullptr) == PI_SUCCESS; + return call_successful && support; + } } return false; // This device aspect has not been implemented yet. diff --git a/sycl/source/detail/physical_mem_impl.hpp b/sycl/source/detail/physical_mem_impl.hpp new file mode 100644 index 0000000000000..9fb38f1202257 --- /dev/null +++ b/sycl/source/detail/physical_mem_impl.hpp @@ -0,0 +1,95 @@ +//==- physical_mem_impl.hpp - sycl_ext_oneapi_virtual_mem physical_mem impl ==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +inline sycl::detail::pi::PiVirtualAccessFlags AccessModeToVirtualAccessFlags( + ext::oneapi::experimental::address_access_mode Mode) { + switch (Mode) { + case ext::oneapi::experimental::address_access_mode::read: + return PI_VIRTUAL_ACCESS_FLAG_READ_ONLY; + case ext::oneapi::experimental::address_access_mode::read_write: + return PI_VIRTUAL_ACCESS_FLAG_RW; + case ext::oneapi::experimental::address_access_mode::none: + return 0; + default: + throw sycl::exception(make_error_code(errc::invalid), + "Invalid address_access_mode."); + } +} + +class physical_mem_impl { +public: + physical_mem_impl(const device &SyclDevice, const context &SyclContext, + size_t NumBytes) + : MDevice(getSyclObjImpl(SyclDevice)), + MContext(getSyclObjImpl(SyclContext)), MNumBytes(NumBytes) { + const PluginPtr &Plugin = MContext->getPlugin(); + + auto Err = Plugin->call_nocheck( + MContext->getHandleRef(), MDevice->getHandleRef(), MNumBytes, + &MPhysicalMem); + + if (Err == PI_ERROR_OUT_OF_RESOURCES || Err == PI_ERROR_OUT_OF_HOST_MEMORY) + throw sycl::exception(make_error_code(errc::memory_allocation), + "Failed to allocate physical memory."); + Plugin->checkPiResult(Err); + } + + ~physical_mem_impl() noexcept(false) { + const PluginPtr &Plugin = MContext->getPlugin(); + Plugin->call(MPhysicalMem); + } + + void *map(uintptr_t Ptr, size_t NumBytes, + ext::oneapi::experimental::address_access_mode Mode, + size_t Offset) const { + sycl::detail::pi::PiVirtualAccessFlags AccessFlags = + AccessModeToVirtualAccessFlags(Mode); + const PluginPtr &Plugin = MContext->getPlugin(); + void *ResultPtr = reinterpret_cast(Ptr); + Plugin->call( + MContext->getHandleRef(), ResultPtr, NumBytes, 
MPhysicalMem, Offset, + AccessFlags); + return ResultPtr; + } + + context get_context() const { + return createSyclObjFromImpl(MContext); + } + device get_device() const { return createSyclObjFromImpl(MDevice); } + size_t size() const noexcept { return MNumBytes; } + + sycl::detail::pi::PiPhysicalMem &getHandleRef() { return MPhysicalMem; } + const sycl::detail::pi::PiPhysicalMem &getHandleRef() const { + return MPhysicalMem; + } + +private: + sycl::detail::pi::PiPhysicalMem MPhysicalMem = nullptr; + const std::shared_ptr MDevice; + const std::shared_ptr MContext; + const size_t MNumBytes; +}; + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index ce88520fe50dd..f7e023c718462 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -86,6 +86,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_ANNOTATED_ARG 1 #define SYCL_EXT_ONEAPI_ANNOTATED_PTR 1 #define SYCL_EXT_ONEAPI_COPY_OPTIMIZE 1 +#define SYCL_EXT_ONEAPI_VIRTUAL_MEM 1 #define SYCL_EXT_ONEAPI_USM_MALLOC_PROPERTIES 1 #cmakedefine01 SYCL_ENABLE_KERNEL_FUSION #if SYCL_ENABLE_KERNEL_FUSION diff --git a/sycl/source/physical_mem.cpp b/sycl/source/physical_mem.cpp new file mode 100644 index 0000000000000..d9d6073a68e89 --- /dev/null +++ b/sycl/source/physical_mem.cpp @@ -0,0 +1,38 @@ +//==--- physical_mem.cpp - sycl_ext_oneapi_virtual_mem physical_mem class --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +physical_mem::physical_mem(const device &SyclDevice, const context &SyclContext, + size_t NumBytes) { + if (!SyclDevice.has(aspect::ext_oneapi_virtual_mem)) + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "Device does not support aspect::ext_oneapi_virtual_mem."); + + impl = std::make_shared( + SyclDevice, SyclContext, NumBytes); +} + +void *physical_mem::map(uintptr_t Ptr, size_t NumBytes, + address_access_mode Mode, size_t Offset) const { + return impl->map(Ptr, NumBytes, Mode, Offset); +} + +context physical_mem::get_context() const { return impl->get_context(); } +device physical_mem::get_device() const { return impl->get_device(); } +size_t physical_mem::size() const noexcept { return impl->size(); } + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/virtual_mem.cpp b/sycl/source/virtual_mem.cpp new file mode 100644 index 0000000000000..8cdc5ffba0223 --- /dev/null +++ b/sycl/source/virtual_mem.cpp @@ -0,0 +1,183 @@ +//==- virtual_mem.cpp - sycl_ext_oneapi_virtual_mem virtual mem free funcs -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +// System headers for querying page-size. 
+#ifdef _WIN32 +#include +#else +#include +#endif + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +__SYCL_EXPORT size_t get_mem_granularity(const device &SyclDevice, + const context &SyclContext, + granularity_mode Mode) { + if (!SyclDevice.has(aspect::ext_oneapi_virtual_mem)) + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "Device does not support aspect::ext_oneapi_virtual_mem."); + + pi_virtual_mem_granularity_info GranularityQuery = [=]() { + switch (Mode) { + case granularity_mode::minimum: + return PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM; + case granularity_mode::recommended: + return PI_EXT_ONEAPI_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED; + } + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Unrecognized granularity mode."); + }(); + + std::shared_ptr DeviceImpl = + sycl::detail::getSyclObjImpl(SyclDevice); + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); +#ifndef NDEBUG + size_t InfoOutputSize; + Plugin->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), GranularityQuery, + 0, nullptr, &InfoOutputSize); + assert(InfoOutputSize == sizeof(size_t) && + "Unexpected output size of granularity info query."); +#endif // NDEBUG + size_t Granularity = 0; + Plugin->call( + ContextImpl->getHandleRef(), DeviceImpl->getHandleRef(), GranularityQuery, + sizeof(size_t), &Granularity, nullptr); + return Granularity; +} + +__SYCL_EXPORT size_t get_mem_granularity(const context &SyclContext, + granularity_mode Mode) { + const std::vector Devices = SyclContext.get_devices(); + if (!std::all_of(Devices.cbegin(), Devices.cend(), [](const device &Dev) { + return Dev.has(aspect::ext_oneapi_virtual_mem); + })) { + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "One or more devices in the context does not support " + 
"aspect::ext_oneapi_virtual_mem."); + } + + // CUDA only needs page-size granularity. + if (SyclContext.get_backend() == backend::ext_oneapi_cuda) { +#ifdef _WIN32 + SYSTEM_INFO SystemInfo; + GetSystemInfo(&SystemInfo); + return static_cast(SystemInfo.dwPageSize); +#else + return static_cast(sysconf(_SC_PAGESIZE)); +#endif + } + + // Otherwise, we find the least common multiple of granularity of the devices + // in the context. + size_t LCMGranularity = get_mem_granularity(Devices[0], SyclContext, Mode); + for (size_t I = 1; I < Devices.size(); ++I) { + size_t DevGranularity = get_mem_granularity(Devices[I], SyclContext, Mode); + size_t GCD = LCMGranularity; + size_t Rem = DevGranularity % GCD; + while (Rem != 0) { + std::swap(GCD, Rem); + Rem %= GCD; + } + LCMGranularity *= DevGranularity / GCD; + } + return LCMGranularity; +} + +__SYCL_EXPORT uintptr_t reserve_virtual_mem(uintptr_t Start, size_t NumBytes, + const context &SyclContext) { + std::vector Devs = SyclContext.get_devices(); + if (std::any_of(Devs.cbegin(), Devs.cend(), [](const device &Dev) { + return !Dev.has(aspect::ext_oneapi_virtual_mem); + })) + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "One or more devices in the supplied context does not support " + "aspect::ext_oneapi_virtual_mem."); + + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + void *OutPtr = nullptr; + Plugin->call( + ContextImpl->getHandleRef(), reinterpret_cast(Start), NumBytes, + &OutPtr); + return reinterpret_cast(OutPtr); +} + +__SYCL_EXPORT void free_virtual_mem(uintptr_t Ptr, size_t NumBytes, + const context &SyclContext) { + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + Plugin->call( + ContextImpl->getHandleRef(), reinterpret_cast(Ptr), NumBytes); +} + +__SYCL_EXPORT void set_access_mode(const 
void *Ptr, size_t NumBytes, + address_access_mode Mode, + const context &SyclContext) { + sycl::detail::pi::PiVirtualAccessFlags AccessFlags = + sycl::detail::AccessModeToVirtualAccessFlags(Mode); + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + Plugin->call( + ContextImpl->getHandleRef(), Ptr, NumBytes, AccessFlags); +} + +__SYCL_EXPORT address_access_mode get_access_mode(const void *Ptr, + size_t NumBytes, + const context &SyclContext) { + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); +#ifndef NDEBUG + size_t InfoOutputSize; + Plugin->call( + ContextImpl->getHandleRef(), Ptr, NumBytes, + PI_EXT_ONEAPI_VIRTUAL_MEM_INFO_ACCESS_MODE, 0, nullptr, &InfoOutputSize); + assert(InfoOutputSize == sizeof(sycl::detail::pi::PiVirtualAccessFlags) && + "Unexpected output size of access mode info query."); +#endif // NDEBUG + sycl::detail::pi::PiVirtualAccessFlags AccessFlags; + Plugin->call( + ContextImpl->getHandleRef(), Ptr, NumBytes, + PI_EXT_ONEAPI_VIRTUAL_MEM_INFO_ACCESS_MODE, + sizeof(sycl::detail::pi::PiVirtualAccessFlags), &AccessFlags, nullptr); + + if (AccessFlags & PI_VIRTUAL_ACCESS_FLAG_RW) + return address_access_mode::read_write; + if (AccessFlags & PI_VIRTUAL_ACCESS_FLAG_READ_ONLY) + return address_access_mode::read; + return address_access_mode::none; +} + +__SYCL_EXPORT void unmap(const void *Ptr, size_t NumBytes, + const context &SyclContext) { + std::shared_ptr ContextImpl = + sycl::detail::getSyclObjImpl(SyclContext); + const sycl::detail::PluginPtr &Plugin = ContextImpl->getPlugin(); + Plugin->call( + ContextImpl->getHandleRef(), Ptr, NumBytes); +} + +} // Namespace ext::oneapi::experimental +} // namespace _V1 +} // Namespace sycl diff --git a/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp b/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp new file mode 
100644 index 0000000000000..cbbcf52e3ab25 --- /dev/null +++ b/sycl/test-e2e/VirtualMem/vector_with_virtual_mem.cpp @@ -0,0 +1,236 @@ +// REQUIRES: aspect-ext_oneapi_virtual_mem, usm_shared_allocations + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +#include +#include +#include + +namespace syclext = sycl::ext::oneapi::experimental; + +// Find the least common multiple of the context and device granularities. This +// value can be used for aligning both physical memory allocations and for +// reserving virtual memory ranges. +size_t GetLCMGranularity(const sycl::device &Dev, const sycl::context &Ctx) { + size_t CtxGranularity = syclext::get_mem_granularity(MContext); + size_t DevGranularity = syclext::get_mem_granularity(MDevice, MContext); + + size_t GCD = CtxGranularity; + size_t Rem = DevGranularity % GCD; + while (Rem != 0) { + std::swap(GCD, Rem); + Rem %= GCD; + } + return (DevGranularity / GCD) * LCMGranularity; +} + +template class VirtualVector { +public: + VirtualVector(sycl::queue &Q) + : MDevice{Q.get_device()}, MContext{Q.get_context()}, + MGranularity{GetLCMGranularity(MDevice, MContext)} {}; + + ~VirtualVector() { + // Free all mapped ranges. + unmap_all(); + for (const VirtualAddressRange &VARange : MVARanges) + syclext::free_virtual_mem(VARange.Ptr, VARange.Size, MContext); + // Physical memory allocations will be freed when the physical_mem objects + // die with MPhysicalMemMappings. + } + + void reserve(size_t NewSize) { + // If we already have more memory than required, we can return. + size_t NewByteSize = sizeof(T) * NewSize; + if (NewByteSize <= MByteSize) { + MSize = NewSize; + return; + } + + // Align the size by the granularity. + size_t AlignedNewByteSize = AlignByteSize(NewByteSize); + size_t AlignedNewVARangeSize = AlignedNewByteSize - MByteSize; + + // Try to reserve virtual memory at the end of the existing one. 
+ uintptr_t CurrentEnd = reinterpret_cast(MBasePtr) + MByteSize; + uintptr_t NewVAPtr = syclext::reserve_virtual_mem( + CurrentEnd, AlignedNewVARangeSize, MContext); + + // If we failed to get a ptr to the end of the current range, we need to + // recreate the whole range. + if (CurrentEnd && NewVAPtr != CurrentEnd) { + // First we need to free the virtual address range we just reserved. + syclext::free_virtual_mem(NewVAPtr, AlignedNewVARangeSize, MContext); + + // Recreate the full range and update the new VA ptr. CurrentEnd is no + // longer valid after this call. + NewVAPtr = RecreateAddressRange(AlignedNewByteSize); + } else { + // Otherwise we need to register the new range. + MVARanges.emplace_back(NewVAPtr, AlignedNewVARangeSize); + + // If there was no base pointer previously, this is now the new base. + if (!MBasePtr) + MBasePtr = reinterpret_cast(NewVAPtr); + } + + // Create new physical memory allocation and map the new range to it. + syclext::physical_mem NewPhysicalMem{MDevice, MContext, + AlignedNewVARangeSize}; + void *MappedPtr = + NewPhysicalMem.map(NewVAPtr, AlignedNewVARangeSize, + syclext::address_access_mode::read_write); + MPhysicalMemMappings.push_back( + std::make_pair(std::move(NewPhysicalMem), MappedPtr)); + + // Update the byte size of the vector. + MSize = NewSize; + MByteSize = AlignedNewByteSize; + } + + size_t size() const noexcept { return MSize; } + T *data() const noexcept { return MBasePtr; } + +private: + size_t AlignByteSize(size_t UnalignedByteSize) const { + return ((UnalignedByteSize + MGranularity - 1) / MGranularity) * + MGranularity; + } + + void unmap_all() { + for (std::pair &Mapping : + MPhysicalMemMappings) { + if (Mapping.second == 0) + continue; + syclext::unmap(Mapping.second, Mapping.first.size(), MContext); + Mapping.second = 0; + } + } + + uintptr_t RecreateAddressRange(size_t AlignedNewByteSize) { + // Reserve the full range. 
+ uintptr_t NewFullVAPtr = + syclext::reserve_virtual_mem(AlignedNewByteSize, MContext); + + // Unmap the old virtual address ranges. + unmap_all(); + + // Remap all existing ranges. + uintptr_t NewEnd = NewFullVAPtr; + for (std::pair &Mapping : + MPhysicalMemMappings) { + Mapping.second = + Mapping.first.map(NewEnd, Mapping.first.size(), + syclext::address_access_mode::read_write); + NewEnd += Mapping.first.size(); + } + + // Free the old ranges. + for (const VirtualAddressRange &VARange : MVARanges) + syclext::free_virtual_mem(VARange.Ptr, VARange.Size, MContext); + + // Insert the newly reserved range to the saved ranges. + MVARanges.clear(); + MVARanges.emplace_back(NewFullVAPtr, AlignedNewByteSize); + + // Update the base pointer to point to the new start. + MBasePtr = reinterpret_cast(NewFullVAPtr); + + // Return the new end of the mapped ranges. + return NewEnd; + } + + struct VirtualAddressRange { + VirtualAddressRange(uintptr_t Ptr, size_t Size) : Ptr{Ptr}, Size{Size} {} + + uintptr_t Ptr; + size_t Size; + }; + + sycl::device MDevice; + sycl::context MContext; + + std::vector MVARanges; + std::vector> MPhysicalMemMappings; + + T *MBasePtr = nullptr; + size_t MSize = 0; + size_t MByteSize = 0; + + const size_t MGranularity = 0; +}; + +static constexpr size_t NumIters = 10; +static constexpr size_t WriteValueOffset = 42; +static constexpr size_t NumWorkItems = 512; + +int main() { + sycl::queue Q; + + VirtualVector Vec(Q); + + // To better test the functionality, try to allocate below the granularity + // but enough to require more memory for some iterations. + size_t SizeIncrement = 11; + size_t MinSizeGran = + syclext::get_mem_granularity(Q.get_device(), Q.get_context()) / + sizeof(int); + SizeIncrement = std::max(MinSizeGran / 2 - 1, SizeIncrement); + + // Each work-item will work on multiple elements. + size_t NumElemsPerWI = 1 + (SizeIncrement - 1) / NumWorkItems; + + for (size_t I = 0; I < NumIters; ++I) { + // Increment the size of the vector. 
+ size_t NewVecSize = (I + 1) * SizeIncrement; + Vec.reserve(NewVecSize); + assert(Vec.size() == NewVecSize); + + // Populate to the new memory + int *VecDataPtr = Vec.data(); + size_t StartOffset = I * SizeIncrement; + size_t IterWriteValueOffset = WriteValueOffset * (I + 1); + Q.parallel_for(sycl::range<1>{NumWorkItems}, [=](sycl::item<1> Idx) { + for (size_t J = 0; J < NumElemsPerWI; ++J) { + size_t LoopIdx = J * Idx.get_range(0) + Idx; + size_t OffsetIdx = StartOffset + LoopIdx; + if (OffsetIdx < NewVecSize) + VecDataPtr[OffsetIdx] = LoopIdx + IterWriteValueOffset; + } + }).wait_and_throw(); + + // Copy back the values and verify. + int *CopyBack = sycl::malloc_shared(NewVecSize, Q); + + // TODO: Level-zero (excluding on PVC) does not currently allow copy across + // virtual memory ranges, even if they are consequtive. + syclext::architecture DevArch = + Q.get_device().get_info(); + if (Q.get_backend() == sycl::backend::ext_oneapi_level_zero && + DevArch != syclext::architecture::intel_gpu_pvc && + DevArch != syclext::architecture::intel_gpu_pvc_vg) { + Q.parallel_for(sycl::range<1>{NewVecSize}, [=](sycl::id<1> Idx) { + CopyBack[Idx] = VecDataPtr[Idx]; + }).wait_and_throw(); + } else { + Q.copy(VecDataPtr, CopyBack, NewVecSize).wait_and_throw(); + } + + for (size_t J = 0; J < NewVecSize; ++J) { + int ExpectedVal = + J % SizeIncrement + WriteValueOffset * (J / SizeIncrement + 1); + if (CopyBack[J] != ExpectedVal) { + std::cout << "Comparison failed at index " << J << ": " << CopyBack[J] + << " != " << ExpectedVal << std::endl; + return 1; + } + } + sycl::free(CopyBack, Q); + } + + return 0; +} diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump index d3047c6bb1cd0..e6b19e97d1b87 100644 --- a/sycl/test/abi/pi_cuda_symbol_check.dump +++ b/sycl/test/abi/pi_cuda_symbol_check.dump @@ -146,6 +146,9 @@ piextMemSampledImageHandleDestroy piextMemUnsampledImageCreate piextMemUnsampledImageHandleDestroy piextPeerAccessGetInfo 
+piextPhysicalMemCreate +piextPhysicalMemRelease +piextPhysicalMemRetain piextPlatformCreateWithNativeHandle piextPlatformGetNativeHandle piextPluginGetOpaqueData @@ -171,4 +174,11 @@ piextUSMImport piextUSMPitchedAlloc piextUSMRelease piextUSMSharedAlloc +piextVirtualMemFree +piextVirtualMemGetInfo +piextVirtualMemGranularityGetInfo +piextVirtualMemMap +piextVirtualMemReserve +piextVirtualMemSetAccess +piextVirtualMemUnmap piextWaitExternalSemaphore diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump index c83b4a4ba6122..530ad95722494 100644 --- a/sycl/test/abi/pi_hip_symbol_check.dump +++ b/sycl/test/abi/pi_hip_symbol_check.dump @@ -146,6 +146,9 @@ piextMemSampledImageHandleDestroy piextMemUnsampledImageCreate piextMemUnsampledImageHandleDestroy piextPeerAccessGetInfo +piextPhysicalMemCreate +piextPhysicalMemRelease +piextPhysicalMemRetain piextPlatformCreateWithNativeHandle piextPlatformGetNativeHandle piextPluginGetOpaqueData @@ -171,4 +174,11 @@ piextUSMImport piextUSMPitchedAlloc piextUSMRelease piextUSMSharedAlloc +piextVirtualMemFree +piextVirtualMemGetInfo +piextVirtualMemGranularityGetInfo +piextVirtualMemMap +piextVirtualMemReserve +piextVirtualMemSetAccess +piextVirtualMemUnmap piextWaitExternalSemaphore diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index d6cc82870c669..93cd4c4de10bb 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -145,6 +145,9 @@ piextMemSampledImageHandleDestroy piextMemUnsampledImageCreate piextMemUnsampledImageHandleDestroy piextPeerAccessGetInfo +piextPhysicalMemCreate +piextPhysicalMemRelease +piextPhysicalMemRetain piextPlatformCreateWithNativeHandle piextPlatformGetNativeHandle piextPluginGetOpaqueData @@ -170,4 +173,11 @@ piextUSMImport piextUSMPitchedAlloc piextUSMRelease piextUSMSharedAlloc +piextVirtualMemFree +piextVirtualMemGetInfo 
+piextVirtualMemGranularityGetInfo +piextVirtualMemMap +piextVirtualMemReserve +piextVirtualMemSetAccess +piextVirtualMemUnmap piextWaitExternalSemaphore diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump index 850e6d22fdb72..c63f579ca6b53 100644 --- a/sycl/test/abi/pi_nativecpu_symbol_check.dump +++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump @@ -146,6 +146,9 @@ piextMemSampledImageHandleDestroy piextMemUnsampledImageCreate piextMemUnsampledImageHandleDestroy piextPeerAccessGetInfo +piextPhysicalMemCreate +piextPhysicalMemRelease +piextPhysicalMemRetain piextPlatformCreateWithNativeHandle piextPlatformGetNativeHandle piextPluginGetOpaqueData @@ -171,4 +174,11 @@ piextUSMImport piextUSMPitchedAlloc piextUSMRelease piextUSMSharedAlloc +piextVirtualMemFree +piextVirtualMemGetInfo +piextVirtualMemGranularityGetInfo +piextVirtualMemMap +piextVirtualMemReserve +piextVirtualMemSetAccess +piextVirtualMemUnmap piextWaitExternalSemaphore diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump index daaf7bbee5de5..8807d1647ebdc 100644 --- a/sycl/test/abi/pi_opencl_symbol_check.dump +++ b/sycl/test/abi/pi_opencl_symbol_check.dump @@ -133,6 +133,9 @@ piextMemGetNativeHandle piextMemImageAllocate piextMemImageCopy piextMemImageCreateWithNativeHandle +piextPhysicalMemCreate +piextPhysicalMemRelease +piextPhysicalMemRetain piextMemImageFree piextMemImageGetInfo piextMemImportOpaqueFD @@ -170,4 +173,11 @@ piextUSMImport piextUSMPitchedAlloc piextUSMRelease piextUSMSharedAlloc +piextVirtualMemFree +piextVirtualMemGetInfo +piextVirtualMemGranularityGetInfo +piextVirtualMemMap +piextVirtualMemReserve +piextVirtualMemSetAccess +piextVirtualMemUnmap piextWaitExternalSemaphore diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 2c97a01f87da7..99fb95d92fa72 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ 
b/sycl/test/abi/sycl_symbols_linux.dump @@ -2990,6 +2990,15 @@ _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE _ZN4sycl3_V13ext6oneapi12experimental10mem_adviseENS0_5queueEPvmiRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi10level_zero6detail11make_deviceERKNS0_8platformEm +_ZN4sycl3_V13ext6oneapi12experimental12physical_memC1ERKNS0_6deviceERKNS0_7contextEm +_ZN4sycl3_V13ext6oneapi12experimental12physical_memC2ERKNS0_6deviceERKNS0_7contextEm +_ZN4sycl3_V13ext6oneapi12experimental15get_access_modeEPKvmRKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental15set_access_modeEPKvmNS3_19address_access_modeERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental16free_virtual_memEmmRKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_6deviceERKNS0_7contextENS3_16granularity_modeE +_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_7contextENS3_16granularity_modeE +_ZN4sycl3_V13ext6oneapi12experimental19reserve_virtual_memEmmRKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental5unmapEPKvmRKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_22bindless_image_samplerERKNS3_16image_descriptorERKNS0_5queueE @@ -3592,6 +3601,10 @@ _ZNK4sycl3_V114interop_handle16getNativeContextEv _ZNK4sycl3_V115device_selector13select_deviceEv _ZNK4sycl3_V116default_selectorclERKNS0_6deviceE _ZNK4sycl3_V120accelerator_selectorclERKNS0_6deviceE +_ZNK4sycl3_V13ext6oneapi12experimental12physical_mem10get_deviceEv +_ZNK4sycl3_V13ext6oneapi12experimental12physical_mem11get_contextEv 
+_ZNK4sycl3_V13ext6oneapi12experimental12physical_mem3mapEmmNS3_19address_access_modeEm +_ZNK4sycl3_V13ext6oneapi12experimental12physical_mem4sizeEv _ZNK4sycl3_V13ext6oneapi12experimental4node14get_successorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node16get_predecessorsEv _ZNK4sycl3_V13ext6oneapi12experimental4node8get_typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index d02be89140c5a..c9eca1ecc8a25 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -609,6 +609,10 @@ ??0kernel_id@_V1@sycl@@AEAA@PEBD@Z ??0kernel_id@_V1@sycl@@QEAA@$$QEAV012@@Z ??0kernel_id@_V1@sycl@@QEAA@AEBV012@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVqueue@45@_K@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVdevice@45@AEBVcontext@45@_K@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z @@ -679,6 +683,7 @@ ??1kernel@_V1@sycl@@QEAA@XZ ??1kernel_bundle_plain@detail@_V1@sycl@@QEAA@XZ ??1kernel_id@_V1@sycl@@QEAA@XZ +??1physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1node@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1platform@_V1@sycl@@QEAA@XZ @@ -696,6 +701,8 @@ ??4?$OwnerLessBase@Vkernel@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4?$OwnerLessBase@Vkernel_id@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4?$OwnerLessBase@Vkernel_id@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z 
+??4?$OwnerLessBase@Vphysical_mem@experimental@oneapi@ext@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z +??4?$OwnerLessBase@Vphysical_mem@experimental@oneapi@ext@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4?$OwnerLessBase@Vplatform@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4?$OwnerLessBase@Vplatform@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4?$OwnerLessBase@Vqueue@_V1@sycl@@@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z @@ -761,6 +768,8 @@ ??4kernel_bundle_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@AEBV012@@Z +??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@$$QEAV0123456@@Z ??4modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@AEBV0123456@@Z ??4node@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z @@ -782,6 +791,7 @@ ??8kernel@_V1@sycl@@QEBA_NAEBV012@@Z ??8kernel_bundle_plain@detail@_V1@sycl@@QEBA_NAEBV0123@@Z ??8kernel_id@_V1@sycl@@QEBA_NAEBV012@@Z +??8physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA_NAEBV012345@@Z ??8platform@_V1@sycl@@QEBA_NAEBV012@@Z ??8queue@_V1@sycl@@QEBA_NAEBV012@@Z ??8sampler@_V1@sycl@@QEBA_NAEBV012@@Z @@ -794,6 +804,7 @@ ??9kernel@_V1@sycl@@QEBA_NAEBV012@@Z ??9kernel_bundle_plain@detail@_V1@sycl@@QEBA_NAEBV0123@@Z ??9kernel_id@_V1@sycl@@QEBA_NAEBV012@@Z +??9physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA_NAEBV012345@@Z ??9platform@_V1@sycl@@QEBA_NAEBV012@@Z ??9queue@_V1@sycl@@QEBA_NAEBV012@@Z ??9sampler@_V1@sycl@@QEBA_NAEBV012@@Z @@ -4038,6 +4049,8 @@ ?ext_oneapi_owner_before@?$OwnerLessBase@Vkernel@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVkernel@34@@Z 
?ext_oneapi_owner_before@?$OwnerLessBase@Vkernel_id@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vkernel_id@_V1@sycl@@@2oneapi@ext@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vkernel_id@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVkernel_id@34@@Z +?ext_oneapi_owner_before@?$OwnerLessBase@Vphysical_mem@experimental@oneapi@ext@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vphysical_mem@experimental@oneapi@ext@_V1@sycl@@@2oneapi@ext@34@@Z +?ext_oneapi_owner_before@?$OwnerLessBase@Vphysical_mem@experimental@oneapi@ext@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVphysical_mem@experimental@oneapi@ext@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vplatform@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vplatform@_V1@sycl@@@2oneapi@ext@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vplatform@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVplatform@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vqueue@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vqueue@_V1@sycl@@@2oneapi@ext@34@@Z @@ -4075,12 +4088,14 @@ ?find_device_intersection@detail@_V1@sycl@@YA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@V?$kernel_bundle@$00@_V1@sycl@@V?$allocator@V?$kernel_bundle@$00@_V1@sycl@@@std@@@5@@Z ?free@_V1@sycl@@YAXPEAXAEBVcontext@12@AEBUcode_location@detail@12@@Z ?free@_V1@sycl@@YAXPEAXAEBVqueue@12@AEBUcode_location@detail@12@@Z -?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z +?free_virtual_mem@experimental@oneapi@ext@_V1@sycl@@YAX_K0AEBVcontext@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVqueue@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@W4image_type@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@W4image_type@12345@AEBVqueue@45@@Z 
+?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVqueue@45@@Z +?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z @@ -4170,6 +4185,7 @@ ?getStartTime@HostProfilingInfo@detail@_V1@sycl@@QEBA_KXZ ?getType@handler@_V1@sycl@@AEAA?AW4CGTYPE@CG@detail@23@XZ ?getValueFromDynamicParameter@detail@_V1@sycl@@YAPEAXAEAVdynamic_parameter_base@1experimental@oneapi@ext@23@@Z +?get_access_mode@experimental@oneapi@ext@_V1@sycl@@YA?AW4address_access_mode@12345@PEBX_KAEBVcontext@45@@Z ?get_addressing_mode@sampler@_V1@sycl@@QEBA?AW4addressing_mode@23@XZ ?get_allocator_internal@buffer_plain@detail@_V1@sycl@@IEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ ?get_allocator_internal@image_plain@detail@_V1@sycl@@IEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ @@ -4189,10 +4205,12 @@ ?get_context@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVcontext@56@XZ ?get_context@kernel@_V1@sycl@@QEBA?AVcontext@23@XZ ?get_context@kernel_bundle_plain@detail@_V1@sycl@@QEBA?AVcontext@34@XZ +?get_context@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVcontext@56@XZ ?get_context@queue@_V1@sycl@@QEBA?AVcontext@23@XZ ?get_coordinate_normalization_mode@sampler@_V1@sycl@@QEBA?AW4coordinate_normalization_mode@23@XZ ?get_count@image_plain@detail@_V1@sycl@@IEBA_KXZ ?get_descriptor@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAAEBUimage_descriptor@23456@XZ 
+?get_device@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@queue@_V1@sycl@@QEBA?AVdevice@23@XZ ?get_devices@context@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ @@ -4218,6 +4236,8 @@ ?get_kernel_ids@_V1@sycl@@YA?AV?$vector@Vkernel_id@_V1@sycl@@V?$allocator@Vkernel_id@_V1@sycl@@@std@@@std@@XZ ?get_kernel_ids@kernel_bundle_plain@detail@_V1@sycl@@QEBA?AV?$vector@Vkernel_id@_V1@sycl@@V?$allocator@Vkernel_id@_V1@sycl@@@std@@@std@@XZ ?get_max_statement_size@stream@_V1@sycl@@QEBA_KXZ +?get_mem_granularity@experimental@oneapi@ext@_V1@sycl@@YA_KAEBVcontext@45@W4granularity_mode@12345@@Z +?get_mem_granularity@experimental@oneapi@ext@_V1@sycl@@YA_KAEBVdevice@45@AEBVcontext@45@W4granularity_mode@12345@@Z ?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVdevice@45@AEBVcontext@45@@Z ?get_mip_level_mem_handle@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@U612345@IAEBVqueue@45@@Z ?get_mip_level_mem_handle@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AUimage_mem_handle@23456@I@Z @@ -4327,6 +4347,7 @@ ?malloc_shared@_V1@sycl@@YAPEAX_KAEBVdevice@12@AEBVcontext@12@AEBVproperty_list@12@AEBUcode_location@detail@12@@Z ?malloc_shared@_V1@sycl@@YAPEAX_KAEBVqueue@12@AEBUcode_location@detail@12@@Z ?malloc_shared@_V1@sycl@@YAPEAX_KAEBVqueue@12@AEBVproperty_list@12@AEBUcode_location@detail@12@@Z +?map@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBAPEAX_K0W4address_access_mode@23456@0@Z ?map_external_image_memory@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z ?map_external_image_memory@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z 
?map_external_memory_array@experimental@oneapi@ext@_V1@sycl@@YA?AUimage_mem_handle@12345@Uinterop_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z @@ -4389,6 +4410,7 @@ ?remquo_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@0PEAH@Z ?remquo_impl@detail@_V1@sycl@@YAMMMPEAH@Z ?remquo_impl@detail@_V1@sycl@@YANNNPEAH@Z +?reserve_virtual_mem@experimental@oneapi@ext@_V1@sycl@@YA_K_K0AEBVcontext@45@@Z ?reset@filter_selector@ONEAPI@_V1@sycl@@QEBAXXZ ?reset@filter_selector@oneapi@ext@_V1@sycl@@QEBAXXZ ?sampledImageConstructorNotification@detail@_V1@sycl@@YAXPEAX0AEBV?$optional@W4image_target@_V1@sycl@@@std@@PEBXIAEBUcode_location@123@@Z @@ -4412,6 +4434,7 @@ ?setStateSpecConstSet@handler@_V1@sycl@@AEAAXXZ ?setType@handler@_V1@sycl@@AEAAXW4CGTYPE@CG@detail@23@@Z ?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z +?set_access_mode@experimental@oneapi@ext@_V1@sycl@@YAXPEBX_KW4address_access_mode@12345@AEBVcontext@45@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXXZ ?set_final_data_internal@image_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z @@ -4427,6 +4450,7 @@ ?sincos_impl@detail@_V1@sycl@@YANNPEAN@Z ?single_task@handler@_V1@sycl@@QEAAXVkernel@23@@Z ?size@exception_list@_V1@sycl@@QEBA_KXZ +?size@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ ?size@stream@_V1@sycl@@QEBA_KXZ ?start@HostProfilingInfo@detail@_V1@sycl@@QEAAXXZ ?start_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ @@ -4442,6 +4466,7 @@ ?sycl_category@_V1@sycl@@YAAEBVerror_category@std@@XZ ?throwIfActionIsCreated@handler@_V1@sycl@@AEAAXXZ ?throw_asynchronous@queue@_V1@sycl@@QEAAXXZ +?unmap@experimental@oneapi@ext@_V1@sycl@@YAXPEBX_KAEBVcontext@45@@Z 
?unsampledImageConstructorNotification@detail@_V1@sycl@@YAXPEAX0AEBV?$optional@W4image_target@_V1@sycl@@@std@@W4mode@access@23@PEBXIAEBUcode_location@123@@Z ?unsampledImageConstructorNotification@image_plain@detail@_V1@sycl@@IEAAXAEBUcode_location@234@PEAXPEBXIQEA_KW4image_format@34@@Z ?unsampledImageDestructorNotification@image_plain@detail@_V1@sycl@@IEAAXPEAX@Z diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index ca29b9bd6aa1e..b7fea5aae4ff9 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1353,6 +1353,61 @@ inline pi_result mock_piextEnqueueDeviceGlobalVariableRead( return PI_SUCCESS; } +inline pi_result +mock_piextVirtualMemGranularityGetInfo(pi_context, pi_device, + pi_virtual_mem_granularity_info, size_t, + void *, size_t *) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextPhysicalMemCreate(pi_context, pi_device, size_t, + pi_physical_mem *ret_physical_mem) { + *ret_physical_mem = createDummyHandle(); + return PI_SUCCESS; +} + +inline pi_result mock_piextPhysicalMemRetain(pi_physical_mem) { + return PI_SUCCESS; +} + +inline pi_result mock_piextPhysicalMemRelease(pi_physical_mem) { + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemReserve(pi_context, const void *start, + size_t range_size, + void **ret_ptr) { + *ret_ptr = + start ? 
const_cast(start) : createDummyHandle(range_size); + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemFree(pi_context, const void *, size_t) { + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemMap(pi_context, const void *, size_t, + pi_physical_mem, size_t, + pi_virtual_access_flags) { + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemUnmap(pi_context, const void *, size_t) { + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemSetAccess(pi_context, const void *, size_t, + pi_virtual_access_flags) { + return PI_SUCCESS; +} + +inline pi_result mock_piextVirtualMemGetInfo(pi_context, const void *, size_t, + pi_virtual_mem_info, size_t, + void *, size_t *) { + return PI_SUCCESS; +} + inline pi_result mock_piextPluginGetOpaqueData(void *opaque_data_param, void **opaque_data_return) { return PI_SUCCESS; From 719e8ef8c0269ab23cd46eaa1d2d2c751a2bcbce Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Mon, 1 Jul 2024 08:59:03 -0700 Subject: [PATCH 58/58] [SYCL] no exceptions leaking from destructors (#14273) Destructors are implicitly noexcept, so we must ensure they don't actually throw exceptions. No change to API or ABI with this PR. 
--- sycl/include/sycl/buffer.hpp | 8 ++- sycl/include/sycl/detail/common.hpp | 11 ++++ sycl/include/sycl/image.hpp | 13 ++++- sycl/include/syclcompat/device.hpp | 10 ++-- sycl/source/detail/context_impl.cpp | 35 +++++++------ sycl/source/detail/device_image_impl.hpp | 21 ++++---- sycl/source/detail/event_impl.cpp | 8 ++- sycl/source/detail/global_handler.cpp | 31 ++++++++---- sycl/source/detail/graph_impl.cpp | 58 ++++++++++++---------- sycl/source/detail/kernel_impl.cpp | 8 ++- sycl/source/detail/pi_utils.hpp | 11 ++-- sycl/source/detail/program_impl.cpp | 12 +++-- sycl/source/detail/queue_impl.hpp | 36 ++++++++------ sycl/source/detail/sampler_impl.cpp | 15 ++++-- sycl/source/detail/thread_pool.hpp | 8 ++- sycl/unittests/thread_safety/ThreadUtils.h | 8 ++- 16 files changed, 192 insertions(+), 101 deletions(-) diff --git a/sycl/include/sycl/buffer.hpp b/sycl/include/sycl/buffer.hpp index 5dde105b678e6..32588de22c980 100644 --- a/sycl/include/sycl/buffer.hpp +++ b/sycl/include/sycl/buffer.hpp @@ -472,7 +472,13 @@ class buffer : public detail::buffer_plain, buffer &operator=(buffer &&rhs) = default; - ~buffer() { buffer_plain::handleRelease(); } + ~buffer() { + try { + buffer_plain::handleRelease(); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~buffer", e); + } + } bool operator==(const buffer &rhs) const { return impl == rhs.impl; } diff --git a/sycl/include/sycl/detail/common.hpp b/sycl/include/sycl/detail/common.hpp index 1c940a21a7223..09c31ef76ef6d 100644 --- a/sycl/include/sycl/detail/common.hpp +++ b/sycl/include/sycl/detail/common.hpp @@ -368,6 +368,17 @@ static constexpr std::array RepeatValue(const T &Arg) { return RepeatValueHelper(Arg, std::make_index_sequence()); } +// to output exceptions caught in ~destructors +#ifndef NDEBUG +#define __SYCL_REPORT_EXCEPTION_TO_STREAM(str, e) \ + { \ + std::cerr << str << " " << e.what() << std::endl; \ + assert(false); \ + } +#else +#define __SYCL_REPORT_EXCEPTION_TO_STREAM(str, 
e) +#endif + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/image.hpp b/sycl/include/sycl/image.hpp index 1239f65cdd259..2d0401764bbe9 100644 --- a/sycl/include/sycl/image.hpp +++ b/sycl/include/sycl/image.hpp @@ -954,7 +954,12 @@ class unsampled_image unsampled_image &operator=(unsampled_image &&rhs) = default; ~unsampled_image() { - common_base::unsampledImageDestructorNotification((void *)this->impl.get()); + try { + common_base::unsampledImageDestructorNotification( + (void *)this->impl.get()); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~unsampled_image", e); + } } bool operator==(const unsampled_image &rhs) const { @@ -1095,7 +1100,11 @@ class sampled_image sampled_image &operator=(sampled_image &&rhs) = default; ~sampled_image() { - common_base::sampledImageDestructorNotification((void *)this->impl.get()); + try { + common_base::sampledImageDestructorNotification((void *)this->impl.get()); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~sampled_image", e); + } } bool operator==(const sampled_image &rhs) const { diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp index ed16a9b32bfa4..3e3e6cb77e71d 100644 --- a/sycl/include/syclcompat/device.hpp +++ b/sycl/include/syclcompat/device.hpp @@ -339,9 +339,13 @@ class device_ext : public sycl::device { public: device_ext() : sycl::device(), _ctx(*this) {} ~device_ext() { - std::lock_guard lock(m_mutex); - sycl::event::wait(_events); - _queues.clear(); + try { + std::lock_guard lock(m_mutex); + sycl::event::wait(_events); + _queues.clear(); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~device_ext", e); + } } device_ext(const sycl::device &base, bool print_on_async_exceptions = false, bool in_order = true) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 8ae13b345b250..910f731071837 100644 
--- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -136,23 +136,26 @@ cl_context context_impl::get() const { } context_impl::~context_impl() { - // Free all events associated with the initialization of device globals. - for (auto &DeviceGlobalInitializer : MDeviceGlobalInitializers) - DeviceGlobalInitializer.second.ClearEvents(getPlugin()); - // Free all device_global USM allocations associated with this context. - for (const void *DeviceGlobal : MAssociatedDeviceGlobals) { - DeviceGlobalMapEntry *DGEntry = - detail::ProgramManager::getInstance().getDeviceGlobalEntry( - DeviceGlobal); - DGEntry->removeAssociatedResources(this); - } - for (auto LibProg : MCachedLibPrograms) { - assert(LibProg.second && "Null program must not be kept in the cache"); - getPlugin()->call(LibProg.second); + try { + // Free all events associated with the initialization of device globals. + for (auto &DeviceGlobalInitializer : MDeviceGlobalInitializers) + DeviceGlobalInitializer.second.ClearEvents(getPlugin()); + // Free all device_global USM allocations associated with this context. 
+ for (const void *DeviceGlobal : MAssociatedDeviceGlobals) { + DeviceGlobalMapEntry *DGEntry = + detail::ProgramManager::getInstance().getDeviceGlobalEntry( + DeviceGlobal); + DGEntry->removeAssociatedResources(this); + } + for (auto LibProg : MCachedLibPrograms) { + assert(LibProg.second && "Null program must not be kept in the cache"); + getPlugin()->call(LibProg.second); + } + // TODO catch an exception and put it to list of asynchronous exceptions + getPlugin()->call(MContext); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~context_impl", e); } - - // TODO catch an exception and put it to list of asynchronous exceptions - getPlugin()->call_nocheck(MContext); } const async_handler &context_impl::get_async_handler() const { diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index f21bf3ccd0185..eda511e340d10 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -300,15 +300,18 @@ class device_image_impl { } ~device_image_impl() { - - if (MProgram) { - const PluginPtr &Plugin = getSyclObjImpl(MContext)->getPlugin(); - Plugin->call(MProgram); - } - if (MSpecConstsBuffer) { - std::lock_guard Lock{MSpecConstAccessMtx}; - const PluginPtr &Plugin = getSyclObjImpl(MContext)->getPlugin(); - memReleaseHelper(Plugin, MSpecConstsBuffer); + try { + if (MProgram) { + const PluginPtr &Plugin = getSyclObjImpl(MContext)->getPlugin(); + Plugin->call(MProgram); + } + if (MSpecConstsBuffer) { + std::lock_guard Lock{MSpecConstAccessMtx}; + const PluginPtr &Plugin = getSyclObjImpl(MContext)->getPlugin(); + memReleaseHelper(Plugin, MSpecConstsBuffer); + } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~device_image_impl", e); } } diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 85afb56fcaf9b..097cef03b4d66 100644 --- a/sycl/source/detail/event_impl.cpp +++ 
b/sycl/source/detail/event_impl.cpp @@ -43,8 +43,12 @@ void event_impl::initContextIfNeeded() { } event_impl::~event_impl() { - if (MEvent) - getPlugin()->call(MEvent); + try { + if (MEvent) + getPlugin()->call(MEvent); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~event_impl", e); + } } void event_impl::waitInternal(bool *Success) { diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 072a9628d6a6b..301125d9b5c93 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -54,14 +54,18 @@ class ObjectUsageCounter { MCounter++; } ~ObjectUsageCounter() { - if (!MModifyCounter) - return; - - LockGuard Guard(GlobalHandler::MSyclGlobalHandlerProtector); - MCounter--; - GlobalHandler *RTGlobalObjHandler = GlobalHandler::getInstancePtr(); - if (RTGlobalObjHandler) { - RTGlobalObjHandler->prepareSchedulerToRelease(!MCounter); + try { + if (!MModifyCounter) + return; + + LockGuard Guard(GlobalHandler::MSyclGlobalHandlerProtector); + MCounter--; + GlobalHandler *RTGlobalObjHandler = GlobalHandler::getInstancePtr(); + if (RTGlobalObjHandler) { + RTGlobalObjHandler->prepareSchedulerToRelease(!MCounter); + } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~ObjectUsageCounter", e); } } @@ -234,12 +238,17 @@ void GlobalHandler::releaseDefaultContexts() { struct EarlyShutdownHandler { ~EarlyShutdownHandler() { + try { #ifdef _WIN32 - // on Windows we keep to the existing shutdown procedure - GlobalHandler::instance().releaseDefaultContexts(); + // on Windows we keep to the existing shutdown procedure + GlobalHandler::instance().releaseDefaultContexts(); #else - shutdown_early(); + shutdown_early(); #endif + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~EarlyShutdownHandler", + e); + } } }; diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 
09ccef30dacd2..9ef8ce262932f 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -297,9 +297,13 @@ void exec_graph_impl::makePartitions() { } graph_impl::~graph_impl() { - clearQueues(); - for (auto &MemObj : MMemObjs) { - MemObj->markNoLongerBeingUsedInGraph(); + try { + clearQueues(); + for (auto &MemObj : MMemObjs) { + MemObj->markNoLongerBeingUsedInGraph(); + } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~graph_impl", e); } } @@ -784,34 +788,38 @@ exec_graph_impl::exec_graph_impl(sycl::context Context, } exec_graph_impl::~exec_graph_impl() { - const sycl::detail::PluginPtr &Plugin = - sycl::detail::getSyclObjImpl(MContext)->getPlugin(); - MSchedule.clear(); - // We need to wait on all command buffer executions before we can release - // them. - for (auto &Event : MExecutionEvents) { - Event->wait(Event); - } + try { + const sycl::detail::PluginPtr &Plugin = + sycl::detail::getSyclObjImpl(MContext)->getPlugin(); + MSchedule.clear(); + // We need to wait on all command buffer executions before we can release + // them. 
+ for (auto &Event : MExecutionEvents) { + Event->wait(Event); + } - for (const auto &Partition : MPartitions) { - Partition->MSchedule.clear(); - for (const auto &Iter : Partition->MPiCommandBuffers) { - if (auto CmdBuf = Iter.second; CmdBuf) { + for (const auto &Partition : MPartitions) { + Partition->MSchedule.clear(); + for (const auto &Iter : Partition->MPiCommandBuffers) { + if (auto CmdBuf = Iter.second; CmdBuf) { + pi_result Res = Plugin->call_nocheck< + sycl::detail::PiApiKind::piextCommandBufferRelease>(CmdBuf); + (void)Res; + assert(Res == pi_result::PI_SUCCESS); + } + } + } + + for (auto &Iter : MCommandMap) { + if (auto Command = Iter.second; Command) { pi_result Res = Plugin->call_nocheck< - sycl::detail::PiApiKind::piextCommandBufferRelease>(CmdBuf); + sycl::detail::PiApiKind::piextCommandBufferReleaseCommand>(Command); (void)Res; assert(Res == pi_result::PI_SUCCESS); } } - } - - for (auto &Iter : MCommandMap) { - if (auto Command = Iter.second; Command) { - pi_result Res = Plugin->call_nocheck< - sycl::detail::PiApiKind::piextCommandBufferReleaseCommand>(Command); - (void)Res; - assert(Res == pi_result::PI_SUCCESS); - } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~exec_graph_impl", e); } } diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index b4ab6b232eef9..8502f3489b9c7 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -75,8 +75,12 @@ kernel_impl::kernel_impl(ContextImplPtr Context, ProgramImplPtr ProgramImpl) : MContext(Context), MProgram(ProgramImpl->getHandleRef()) {} kernel_impl::~kernel_impl() { - // TODO catch an exception and put it to list of asynchronous exceptions - getPlugin()->call(MKernel); + try { + // TODO catch an exception and put it to list of asynchronous exceptions + getPlugin()->call(MKernel); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~kernel_impl", e); + } } bool 
kernel_impl::isCreatedFromSource() const { diff --git a/sycl/source/detail/pi_utils.hpp b/sycl/source/detail/pi_utils.hpp index 877cbd0d14e52..fa288c91fc583 100644 --- a/sycl/source/detail/pi_utils.hpp +++ b/sycl/source/detail/pi_utils.hpp @@ -31,9 +31,14 @@ struct OwnedPiEvent { MPlugin->call(*MEvent); } ~OwnedPiEvent() { - // Release the event if the ownership was not transferred. - if (MEvent.has_value()) - MPlugin->call(*MEvent); + try { + // Release the event if the ownership was not transferred. + if (MEvent.has_value()) + MPlugin->call(*MEvent); + + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~OwnedPiEvent", e); + } } OwnedPiEvent(OwnedPiEvent &&Other) diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index f3ac2185627f9..ca5628fb1a8d6 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -203,10 +203,14 @@ program_impl::program_impl(ContextImplPtr Context, } program_impl::~program_impl() { - // TODO catch an exception and put it to list of asynchronous exceptions - if (MProgram != nullptr) { - const PluginPtr &Plugin = getPlugin(); - Plugin->call(MProgram); + try { + // TODO catch an exception and put it to list of asynchronous exceptions + if (MProgram != nullptr) { + const PluginPtr &Plugin = getPlugin(); + Plugin->call(MProgram); + } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~program_impl", e); } } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 123efc3d87af6..ccaf52cccd408 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -306,24 +306,28 @@ class queue_impl { } ~queue_impl() { - // The trace event created in the constructor should be active through the - // lifetime of the queue object as member variables when ABI breakage is - // allowed. This example shows MTraceEvent as a member variable. 
+ try { + // The trace event created in the constructor should be active through the + // lifetime of the queue object as member variables when ABI breakage is + // allowed. This example shows MTraceEvent as a member variable. #if XPTI_ENABLE_INSTRUMENTATION - constexpr uint16_t NotificationTraceType = - static_cast(xpti::trace_point_type_t::queue_destroy); - if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) { - // Used cached information in member variables - xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr, - (xpti::trace_event_data_t *)MTraceEvent, - MInstanceID, - static_cast("queue_destroy")); - xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent); - } + constexpr uint16_t NotificationTraceType = + static_cast(xpti::trace_point_type_t::queue_destroy); + if (xptiCheckTraceEnabled(MStreamID, NotificationTraceType)) { + // Used cached information in member variables + xptiNotifySubscribers(MStreamID, NotificationTraceType, nullptr, + (xpti::trace_event_data_t *)MTraceEvent, + MInstanceID, + static_cast("queue_destroy")); + xptiReleaseEvent((xpti::trace_event_data_t *)MTraceEvent); + } #endif - throw_asynchronous(); - cleanup_fusion_cmd(); - getPlugin()->call(MQueues[0]); + throw_asynchronous(); + cleanup_fusion_cmd(); + getPlugin()->call(MQueues[0]); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~queue_impl", e); + } } /// \return an OpenCL interoperability queue handle. 
diff --git a/sycl/source/detail/sampler_impl.cpp b/sycl/source/detail/sampler_impl.cpp index c2af7884a164c..edca8eb1be025 100644 --- a/sycl/source/detail/sampler_impl.cpp +++ b/sycl/source/detail/sampler_impl.cpp @@ -40,11 +40,16 @@ sampler_impl::sampler_impl(cl_sampler clSampler, const context &syclContext) { } sampler_impl::~sampler_impl() { - std::lock_guard Lock(MMutex); - for (auto &Iter : MContextToSampler) { - // TODO catch an exception and add it to the list of asynchronous exceptions - const PluginPtr &Plugin = getSyclObjImpl(Iter.first)->getPlugin(); - Plugin->call(Iter.second); + try { + std::lock_guard Lock(MMutex); + for (auto &Iter : MContextToSampler) { + // TODO catch an exception and add it to the list of asynchronous + // exceptions + const PluginPtr &Plugin = getSyclObjImpl(Iter.first)->getPlugin(); + Plugin->call(Iter.second); + } + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~sampler_impl", e); } } diff --git a/sycl/source/detail/thread_pool.hpp b/sycl/source/detail/thread_pool.hpp index 35adb98e9d570..304045389b53b 100644 --- a/sycl/source/detail/thread_pool.hpp +++ b/sycl/source/detail/thread_pool.hpp @@ -74,7 +74,13 @@ class ThreadPool { start(); } - ~ThreadPool() { finishAndWait(); } + ~ThreadPool() { + try { + finishAndWait(); + } catch (std::exception &e) { + __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~ThreadPool", e); + } + } void finishAndWait() { MStop.store(true); diff --git a/sycl/unittests/thread_safety/ThreadUtils.h b/sycl/unittests/thread_safety/ThreadUtils.h index ccbca98d44e3f..4b40123ba1bb7 100644 --- a/sycl/unittests/thread_safety/ThreadUtils.h +++ b/sycl/unittests/thread_safety/ThreadUtils.h @@ -48,7 +48,13 @@ class ThreadPool { enqueueHelper(std::forward(funcs)...); } - ~ThreadPool() { wait(); } + ~ThreadPool() { + try { + wait(); + } catch (std::exception &e) { + std::cerr << "exception in ~ThreadPool: " << e.what() << std::endl; + } + } private: template