diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index b7eb25ed62ec0..4d3e841f731e1 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -305,6 +305,8 @@ typedef enum { PI_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 0x10025, PI_DEVICE_INFO_MAX_MEM_BANDWIDTH = 0x10026, PI_DEVICE_INFO_IMAGE_SRGB = 0x10027, + // Return true if sub-device should do its own program build + PI_DEVICE_INFO_BUILD_ON_SUBDEVICE = 0x10028, PI_DEVICE_INFO_ATOMIC_64 = 0x10110, PI_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 0x10111, PI_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 0x11000, diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 7a1ac28ebc055..316b8681605bb 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -1490,6 +1490,10 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, PI_TRUE); } + case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_TRUE); + } case PI_DEVICE_INFO_COMPILER_AVAILABLE: { return getInfo(param_value_size, param_value, param_value_size_ret, PI_TRUE); diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 1f0cba964b8bc..70dec8ab9816b 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -667,6 +667,8 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, return ReturnValue(""); case PI_DEVICE_INFO_VERSION: return ReturnValue(Device->VersionStr.c_str()); + case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: // emulator doesn't support partition + return ReturnValue(pi_bool{true}); case PI_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(pi_bool{false}); case PI_DEVICE_INFO_LINKER_AVAILABLE: diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 06e6d862a62d0..9c6343e8f7395 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -1409,6 +1409,10 @@ pi_result hip_piDeviceGetInfo(pi_device device, pi_device_info param_name, return getInfo(param_value_size, param_value, param_value_size_ret, PI_TRUE); } + case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + return getInfo(param_value_size, param_value, param_value_size_ret, + PI_TRUE); + } case PI_DEVICE_INFO_COMPILER_AVAILABLE: { return getInfo(param_value_size, param_value, param_value_size_ret, PI_TRUE); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 181610b1328b6..98ed68a7c41a9 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -2028,7 +2028,7 @@ pi_result piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle, return PI_INVALID_VALUE; } -// Get the cahched PI device created for the L0 device handle. +// Get the cached PI device created for the L0 device handle. // Return NULL if no such PI device found. pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { @@ -2188,6 +2188,11 @@ pi_result _pi_platform::populateDeviceCacheIfNeeded() { // Create PI sub-sub-devices with the sub-device for all the ordinals. // Each {ordinal, index} points to a specific CCS which constructs // a sub-sub-device at this point. + // FIXME: Level Zero creates multiple PiDevices for a single physical + // device when sub-device is partitioned into sub-sub-devices. + // Sub-sub-device is technically a command queue and we should not build + // program for each command queue. PiDevice is probably not the right + // abstraction for a Level Zero command queue. for (uint32_t J = 0; J < Ordinals.size(); ++J) { for (uint32_t K = 0; K < QueueGroupProperties[Ordinals[J]].numQueues; ++K) { @@ -2276,8 +2281,7 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, } } case PI_DEVICE_INFO_PARENT_DEVICE: - // TODO: all Level Zero devices are parent ? - return ReturnValue(pi_device{0}); + return ReturnValue(Device->RootDevice); case PI_DEVICE_INFO_PLATFORM: return ReturnValue(Device->Platform); case PI_DEVICE_INFO_VENDOR_ID: @@ -2337,6 +2341,11 @@ pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName, } case PI_DEVICE_INFO_NAME: return ReturnValue(Device->ZeDeviceProperties->name); + // zeModuleCreate allows using root device module for sub-devices: + // > The application must only use the module for the device, or its + // > sub-devices, which was provided during creation. + case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: + return ReturnValue(PI_FALSE); case PI_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(pi_bool{1}); case PI_DEVICE_INFO_LINKER_AVAILABLE: diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 8123da1c709d2..2f278dcdaded1 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -203,7 +203,17 @@ pi_result piDeviceGetInfo(pi_device device, pi_device_info paramName, std::memcpy(paramValue, &result, sizeof(cl_bool)); return PI_SUCCESS; } + case PI_DEVICE_INFO_BUILD_ON_SUBDEVICE: { + cl_device_type devType = CL_DEVICE_TYPE_DEFAULT; + cl_int res = clGetDeviceInfo(cast(device), CL_DEVICE_TYPE, + sizeof(cl_device_type), &devType, nullptr); + // FIXME: here we assume that program built for a root GPU device can be + // used on its sub-devices without re-building + cl_bool result = (res == CL_SUCCESS) && (devType == CL_DEVICE_TYPE_GPU); + std::memcpy(paramValue, &result, sizeof(cl_bool)); + return PI_SUCCESS; + } case PI_EXT_ONEAPI_DEVICE_INFO_MAX_WORK_GROUPS_3D: // Returns the maximum sizes of a work group for each dimension one // could use to submit a kernel. There is no such query defined in OpenCL diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index e5337b97d8521..8fe484d61d038 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -53,13 +53,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle, Plugin.call( MDevice, PI_DEVICE_INFO_TYPE, sizeof(RT::PiDeviceType), &MType, nullptr); - RT::PiDevice parent = nullptr; // TODO catch an exception and put it to list of asynchronous exceptions Plugin.call(MDevice, PI_DEVICE_INFO_PARENT_DEVICE, - sizeof(RT::PiDevice), &parent, + sizeof(RT::PiDevice), &MRootDevice, nullptr); - MIsRootDevice = (nullptr == parent); if (!InteroperabilityConstructor) { // TODO catch an exception and put it to list of asynchronous exceptions // Interoperability Constructor already calls DeviceRetain in diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 650569225fa4c..91af40d86b3e5 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -226,6 +226,8 @@ class device_impl { bool isAssertFailSupported() const; + bool isRootDevice() const { return MRootDevice == nullptr; } + std::string getDeviceName() const; private: @@ -233,7 +235,7 @@ class device_impl { PlatformImplPtr Platform, const plugin &Plugin); RT::PiDevice MDevice = 0; RT::PiDeviceType MType; - bool MIsRootDevice = false; + RT::PiDevice MRootDevice = nullptr; bool MIsHostDevice; PlatformImplPtr MPlatform; bool MIsAssertFailSupported = false; diff --git a/sycl/source/detail/persistent_device_code_cache.cpp b/sycl/source/detail/persistent_device_code_cache.cpp index 0ad560b6094ca..7b2e9bb769760 100644 --- a/sycl/source/detail/persistent_device_code_cache.cpp +++ b/sycl/source/detail/persistent_device_code_cache.cpp @@ -78,10 +78,13 @@ void PersistentDeviceCodeCache::putItemToDisc( const SerializedObj &SpecConsts, const std::string &BuildOptionsString, const RT::PiProgram &NativePrg) { + if (!isImageCached(Img)) + return; + std::string DirName = getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString); - if (!isImageCached(Img) || DirName.empty()) + if (DirName.empty()) return; auto Plugin = detail::getSyclObjImpl(Device)->getPlugin(); @@ -137,10 +140,13 @@ std::vector> PersistentDeviceCodeCache::getItemFromDisc( const device &Device, const RTDeviceBinaryImage &Img, const SerializedObj &SpecConsts, const std::string &BuildOptionsString) { + if (!isImageCached(Img)) + return {}; + std::string Path = getCacheItemPath(Device, Img, SpecConsts, BuildOptionsString); - if (!isImageCached(Img) || Path.empty() || !OSUtil::isPathPresent(Path)) + if (Path.empty() || !OSUtil::isPathPresent(Path)) return {}; int i = 0; diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 7ba084f315890..29601fa3e5355 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -481,10 +481,29 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( if (Prg) Prg->stableSerializeSpecConstRegistry(SpecConsts); - auto BuildF = [this, &M, &KSId, &ContextImpl, &DeviceImpl, Prg, &CompileOpts, + // Check if we can optimize program builds for sub-devices by using a program + // built for the root device + DeviceImplPtr RootDevImpl = DeviceImpl; + while (!RootDevImpl->isRootDevice()) { + auto ParentDev = detail::getSyclObjImpl( + RootDevImpl->get_info()); + // Sharing is allowed within a single context only + if (!ContextImpl->hasDevice(ParentDev)) + break; + RootDevImpl = ParentDev; + } + + pi_bool MustBuildOnSubdevice = PI_TRUE; + ContextImpl->getPlugin().call( + RootDevImpl->getHandleRef(), PI_DEVICE_INFO_BUILD_ON_SUBDEVICE, + sizeof(pi_bool), &MustBuildOnSubdevice, nullptr); + + DeviceImplPtr Dev = + (MustBuildOnSubdevice == PI_TRUE) ? DeviceImpl : RootDevImpl; + auto BuildF = [this, &M, &KSId, &ContextImpl, &Dev, Prg, &CompileOpts, &LinkOpts, &JITCompilationIsRequired, SpecConsts] { auto Context = createSyclObjFromImpl(ContextImpl); - auto Device = createSyclObjFromImpl(DeviceImpl); + auto Device = createSyclObjFromImpl(Dev); const RTDeviceBinaryImage &Img = getDeviceImage(M, KSId, Context, Device, JITCompilationIsRequired); @@ -536,7 +555,7 @@ RT::PiProgram ProgramManager::getBuiltPIProgram( return BuiltProgram.release(); }; - const RT::PiDevice PiDevice = DeviceImpl->getHandleRef(); + const RT::PiDevice PiDevice = Dev->getHandleRef(); auto BuildResult = getOrBuild( Cache, diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index d4aeea252bef2..dc4a3c15ca0bb 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -108,8 +108,8 @@ class ProgramManager { SerializedObj SpecConsts); /// Builds or retrieves from cache a program defining the kernel with given /// name. - /// \param M idenfies the OS module the kernel comes from (multiple OS modules - /// may have kernels with the same name) + /// \param M identifies the OS module the kernel comes from (multiple OS + /// modules may have kernels with the same name) /// \param Context the context to build the program with /// \param Device the device for which the program is built /// \param KernelName the kernel's name @@ -153,7 +153,7 @@ class ProgramManager { /// \param NativePrg the native program, target for spec constant setting; if /// not null then overrides the native program in Prg /// \param Img A source of the information about which constants need - /// setting and symboling->integer spec constnant ID mapping. If not + /// setting and symboling->integer spec constant ID mapping. If not /// null, overrides native program->binary image binding maintained by /// the program manager. void flushSpecConstants(const program_impl &Prg, diff --git a/sycl/unittests/program_manager/CMakeLists.txt b/sycl/unittests/program_manager/CMakeLists.txt index 7ee79016233f9..d9992d2ea25c9 100644 --- a/sycl/unittests/program_manager/CMakeLists.txt +++ b/sycl/unittests/program_manager/CMakeLists.txt @@ -4,5 +4,6 @@ add_sycl_unittest(ProgramManagerTests OBJECT BuildLog.cpp EliminatedArgMask.cpp itt_annotations.cpp + SubDevices.cpp ) diff --git a/sycl/unittests/program_manager/SubDevices.cpp b/sycl/unittests/program_manager/SubDevices.cpp new file mode 100644 index 0000000000000..2cba00776e42e --- /dev/null +++ b/sycl/unittests/program_manager/SubDevices.cpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include + +#include + +#include + +static pi_device rootDevice; +static pi_device piSubDev1 = (pi_device)0x1; +static pi_device piSubDev2 = (pi_device)0x2; + +namespace { +pi_result redefinedDeviceGetInfo(pi_device device, pi_device_info param_name, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + if (param_name == PI_DEVICE_INFO_PARTITION_PROPERTIES) { + if (!param_value) { + *param_value_size_ret = 2 * sizeof(pi_device_partition_property); + } else { + ((pi_device_partition_property *)param_value)[0] = + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; + ((pi_device_partition_property *)param_value)[1] = + PI_DEVICE_PARTITION_BY_AFFINITY_DOMAIN; + } + } + if (param_name == PI_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN) { + if (!param_value) { + *param_value_size_ret = sizeof(pi_device_affinity_domain); + } else { + ((pi_device_affinity_domain *)param_value)[0] = + PI_DEVICE_AFFINITY_DOMAIN_NUMA | + PI_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE; + } + } + if (param_name == PI_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES) { + ((pi_uint32 *)param_value)[0] = 2; + } + if (param_name == PI_DEVICE_INFO_PARENT_DEVICE) { + if (device == piSubDev1 || device == piSubDev2) + ((pi_device *)param_value)[0] = rootDevice; + else + ((pi_device *)param_value)[0] = nullptr; + } + return PI_SUCCESS; +} + +pi_result redefinedDevicePartition( + pi_device Device, const pi_device_partition_property *Properties, + pi_uint32 NumDevices, pi_device *OutDevices, pi_uint32 *OutNumDevices) { + if (OutNumDevices) + *OutNumDevices = 2; + if (OutDevices) { + OutDevices[0] = {}; + OutDevices[1] = {}; + } + return PI_SUCCESS; +} + +pi_result redefinedDeviceRetain(pi_device c) { return PI_SUCCESS; } + +pi_result redefinedDeviceRelease(pi_device c) { return PI_SUCCESS; } + +pi_result redefinedProgramBuild( + pi_program prog, pi_uint32, const pi_device *, const char *, + void (*pfn_notify)(pi_program program, void *user_data), void *user_data) { + static int m = 0; + m++; + // if called more than once return an error + if (m > 1) + return PI_ERROR_UNKNOWN; + + return PI_SUCCESS; +} + +pi_result redefinedContextCreate(const pi_context_properties *Properties, + pi_uint32 NumDevices, const pi_device *Devices, + void (*PFnNotify)(const char *ErrInfo, + const void *PrivateInfo, + size_t CB, void *UserData), + void *UserData, pi_context *RetContext) { + return PI_SUCCESS; +} +} // anonymous namespace + +// Check that program is built once for all sub-devices +// FIXME: mock 3 devices (one root device + two sub-devices) within a single +// context. +TEST(SubDevices, DISABLED_BuildProgramForSubdevices) { + sycl::platform Plt{sycl::default_selector()}; + // Host devices do not support sub-devices + if (Plt.is_host() || Plt.get_backend() == sycl::backend::ext_oneapi_cuda || + Plt.get_backend() == sycl::backend::ext_oneapi_hip) { + std::cerr << "Test is not supported on " + << Plt.get_info() << ", skipping\n"; + GTEST_SKIP(); // test is not supported on selected platform. + } + + // Setup Mock APIs + sycl::unittest::PiMock Mock{Plt}; + setupDefaultMockAPIs(Mock); + Mock.redefine( + redefinedDeviceGetInfo); + Mock.redefine( + redefinedDevicePartition); + Mock.redefine(redefinedDeviceRetain); + Mock.redefine( + redefinedDeviceRelease); + Mock.redefine(redefinedProgramBuild); + Mock.redefine( + redefinedContextCreate); + + // Create 2 sub-devices and use first platform device as a root device + const sycl::device device = Plt.get_devices()[0]; + // Initialize root device + rootDevice = sycl::detail::getSyclObjImpl(device)->getHandleRef(); + // Initialize sub-devices + auto PltImpl = sycl::detail::getSyclObjImpl(Plt); + auto subDev1 = + std::make_shared(piSubDev1, PltImpl); + auto subDev2 = + std::make_shared(piSubDev2, PltImpl); + sycl::context Ctx{ + {device, sycl::detail::createSyclObjFromImpl(subDev1), + sycl::detail::createSyclObjFromImpl(subDev2)}}; + + // Create device binary description structures for getBuiltPIProgram API. + auto devBin = Img.convertToNativeType(); + pi_device_binaries_struct devBinStruct{PI_DEVICE_BINARIES_VERSION, 1, + &devBin}; + sycl::detail::ProgramManager::getInstance().addImages(&devBinStruct); + + // Build program via getBuiltPIProgram API + sycl::detail::ProgramManager::getInstance().getBuiltPIProgram( + sycl::detail::OSUtil::getOSModuleHandle(&devBin), + sycl::detail::getSyclObjImpl(Ctx), subDev1, + sycl::detail::KernelInfo::getName()); + // This call should re-use built binary from the cache. If piProgramBuild is + // called again, the test will fail as second call of redefinedProgramBuild + sycl::detail::ProgramManager::getInstance().getBuiltPIProgram( + sycl::detail::OSUtil::getOSModuleHandle(&devBin), + sycl::detail::getSyclObjImpl(Ctx), subDev2, + sycl::detail::KernelInfo::getName()); +}