diff --git a/sycl/source/backend.cpp b/sycl/source/backend.cpp index 691bdbf0ab91..37407c35e61d 100644 --- a/sycl/source/backend.cpp +++ b/sycl/source/backend.cpp @@ -187,7 +187,7 @@ make_kernel_bundle(pi_native_handle NativeHandle, const context &TargetContext, // this by pre-building the device image and extracting kernel info. We can't // do the same to user images, since they may contain references to undefined // symbols (e.g. when kernel_bundle is supposed to be joined with another). - std::vector KernelIDs{}; + auto KernelIDs = std::make_shared>(); auto DevImgImpl = std::make_shared( nullptr, TargetContext, Devices, State, KernelIDs, PiProgram); device_image_plain DevImg{DevImgImpl}; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 42310d97d3db..4886c5c8ae25 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -32,6 +32,12 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +template struct LessByHash { + bool operator()(const T &LHS, const T &RHS) const { + return getSyclObjImpl(LHS) < getSyclObjImpl(RHS); + } +}; + // The class is impl counterpart for sycl::device_image // It can represent a program in different states, kernel_id's it has and state // of specialization constants for it @@ -51,7 +57,8 @@ class device_image_impl { device_image_impl(const RTDeviceBinaryImage *BinImage, context Context, std::vector Devices, bundle_state State, - std::vector KernelIDs, RT::PiProgram Program) + std::shared_ptr> KernelIDs, + RT::PiProgram Program) : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), MKernelIDs(std::move(KernelIDs)) { @@ -60,8 +67,8 @@ class device_image_impl { device_image_impl(const RTDeviceBinaryImage *BinImage, context Context, std::vector Devices, bundle_state State, - std::vector KernelIDs, RT::PiProgram Program, - const SpecConstMapT &SpecConstMap, + std::shared_ptr> KernelIDs, + RT::PiProgram Program, const SpecConstMapT &SpecConstMap, const std::vector &SpecConstsBlob) : MBinImage(BinImage), MContext(std::move(Context)), MDevices(std::move(Devices)), MState(State), MProgram(Program), @@ -69,8 +76,8 @@ class device_image_impl { MSpecConstSymMap(SpecConstMap) {} bool has_kernel(const kernel_id &KernelIDCand) const noexcept { - return std::binary_search(MKernelIDs.begin(), MKernelIDs.end(), - KernelIDCand, LessByNameComp{}); + return std::binary_search(MKernelIDs->begin(), MKernelIDs->end(), + KernelIDCand, LessByHash{}); } bool has_kernel(const kernel_id &KernelIDCand, @@ -83,7 +90,7 @@ class device_image_impl { } const std::vector &get_kernel_ids() const noexcept { - return MKernelIDs; + return *MKernelIDs; } bool has_specialization_constants() const noexcept { @@ -176,7 +183,9 @@ class device_image_impl { const context &get_context() const noexcept { return MContext; } - std::vector &get_kernel_ids_ref() noexcept { return MKernelIDs; } + std::shared_ptr> &get_kernel_ids_ptr() noexcept { + return MKernelIDs; + } std::vector &get_spec_const_blob_ref() noexcept { return MSpecConstsBlob; @@ -312,7 +321,7 @@ class device_image_impl { RT::PiProgram MProgram = nullptr; // List of kernel ids available in this image, elements should be sorted // according to LessByNameComp - std::vector MKernelIDs; + std::shared_ptr> MKernelIDs; // A mutex for sycnhronizing access to spec constants blob. Mutable because // needs to be locked in the const method for getting spec constant value. diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index a6828b6640e0..cc8bf4b38249 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -28,12 +28,6 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -template struct LessByHash { - bool operator()(const T &LHS, const T &RHS) const { - return getSyclObjImpl(LHS) < getSyclObjImpl(RHS); - } -}; - static bool checkAllDevicesAreInContext(const std::vector &Devices, const context &Context) { const std::vector &ContextDevices = Context.get_devices(); diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 7ba084f31589..f7659172dbc5 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1070,6 +1070,59 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { ArgMaskMap[Info->Name] = createKernelArgMask(pi::DeviceBinaryProperty(Info).asByteArray()); } + + // Fill maps for kernel bundles + if (EntriesB != EntriesE) { + std::lock_guard KernelIDsGuard(m_KernelIDsMutex); + + // Register all exported symbols + auto ExportedSymbols = Img->getExportedSymbols(); + for (const pi_device_binary_property &ExportedSymbol : ExportedSymbols) + m_ExportedSymbols.insert(ExportedSymbol->Name); + + m_BinImg2KernelIDs[Img.get()].reset(new std::vector); + + for (_pi_offload_entry EntriesIt = EntriesB; EntriesIt != EntriesE; + ++EntriesIt) { + + // Skip creating unique kernel ID if it is a service kernel. + // SYCL service kernels are identified by having + // __sycl_service_kernel__ in the mangled name, primarily as part of + // the namespace of the name type. + if (std::strstr(EntriesIt->name, "__sycl_service_kernel__")) { + m_ServiceKernels.insert(EntriesIt->name); + continue; + } + + // Skip creating unique kernel ID if it is an exported device + // function. Exported device functions appear in the offload entries + // among kernels, but are identifiable by being listed in properties. + if (m_ExportedSymbols.find(EntriesIt->name) != m_ExportedSymbols.end()) + continue; + + // ... and create a unique kernel ID for the entry + auto It = m_KernelName2KernelIDs.find(EntriesIt->name); + if (It == m_KernelName2KernelIDs.end()) { + std::shared_ptr KernelIDImpl = + std::make_shared(EntriesIt->name); + sycl::kernel_id KernelID = + detail::createSyclObjFromImpl(KernelIDImpl); + + It = m_KernelName2KernelIDs.emplace_hint(It, EntriesIt->name, + KernelID); + } + + m_KernelIDs2BinImage.insert(std::make_pair(It->second, Img.get())); + m_BinImg2KernelIDs[Img.get()]->push_back(It->second); + } + + // Sort kernel ids for faster search + std::sort(m_BinImg2KernelIDs[Img.get()]->begin(), + m_BinImg2KernelIDs[Img.get()]->end(), LessByHash{}); + } + + // TODO: Remove the code below once program manager works trought kernel + // bundles only // Use the entry information if it's available if (EntriesB != EntriesE) { // The kernel sets for any pair of images are either disjoint or @@ -1094,40 +1147,9 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { { std::lock_guard KernelIDsGuard(m_KernelIDsMutex); - // Register all exported symbols - auto ExportedSymbols = Img->getExportedSymbols(); - for (const pi_device_binary_property &ExportedSymbol : ExportedSymbols) - m_ExportedSymbols.insert(ExportedSymbol->Name); - for (_pi_offload_entry EntriesIt = EntriesB; EntriesIt != EntriesE; ++EntriesIt) { - auto Result = KSIdMap.insert(std::make_pair(EntriesIt->name, KSId)); - (void)Result; - assert(Result.second && "Kernel sets are not disjoint"); - - // Skip creating unique kernel ID if it is a service kernel. - // SYCL service kernels are identified by having - // __sycl_service_kernel__ in the mangled name, primarily as part of - // the namespace of the name type. - if (std::strstr(EntriesIt->name, "__sycl_service_kernel__")) { - m_ServiceKernels.insert(EntriesIt->name); - continue; - } - - // Skip creating unique kernel ID if it is an exported device - // function. Exported device functions appear in the offload entries - // among kernels, but are identifiable by being listed in properties. - if (m_ExportedSymbols.find(EntriesIt->name) != - m_ExportedSymbols.end()) - continue; - - // ... and create a unique kernel ID for the entry - std::shared_ptr KernelIDImpl = - std::make_shared(EntriesIt->name); - sycl::kernel_id KernelID = - detail::createSyclObjFromImpl(KernelIDImpl); - m_KernelIDs.insert( - std::make_pair(EntriesIt->name, std::move(KernelID))); + KSIdMap.insert(std::make_pair(EntriesIt->name, KSId)); } } // ... and initialize associated device_global information @@ -1373,8 +1395,8 @@ static bool compatibleWithDevice(RTDeviceBinaryImage *BinImage, kernel_id ProgramManager::getSYCLKernelID(const std::string &KernelName) { std::lock_guard KernelIDsGuard(m_KernelIDsMutex); - auto KernelID = m_KernelIDs.find(KernelName); - if (KernelID == m_KernelIDs.end()) + auto KernelID = m_KernelName2KernelIDs.find(KernelName); + if (KernelID == m_KernelName2KernelIDs.end()) throw runtime_error("No kernel found with the specified name", PI_INVALID_KERNEL_NAME); @@ -1385,8 +1407,8 @@ std::vector ProgramManager::getAllSYCLKernelIDs() { std::lock_guard KernelIDsGuard(m_KernelIDsMutex); std::vector AllKernelIDs; - AllKernelIDs.reserve(m_KernelIDs.size()); - for (std::pair KernelID : m_KernelIDs) { + AllKernelIDs.reserve(m_KernelName2KernelIDs.size()); + for (std::pair KernelID : m_KernelName2KernelIDs) { AllKernelIDs.push_back(KernelID.second); } return AllKernelIDs; @@ -1417,80 +1439,58 @@ void ProgramManager::addDeviceGlobalEntry(void *DeviceGlobalPtr, std::vector ProgramManager::getSYCLDeviceImagesWithCompatibleState( const context &Ctx, const std::vector &Devs, - bundle_state TargetState) { - - // Collect raw device images - std::vector BinImages; - { + bundle_state TargetState, const std::vector &KernelIDs) { + + // Collect unique raw device images taking into account kernel ids passed + // TODO: Can we avoid repacking? + std::set BinImages; + if (!KernelIDs.empty()) { + std::lock_guard KernelIDsGuard(m_KernelIDsMutex); + for (const kernel_id &KID : KernelIDs) { + auto Range = m_KernelIDs2BinImage.equal_range(KID); + for (auto It = Range.first, End = Range.second; It != End; ++It) + BinImages.insert(It->second); + } + } else { std::lock_guard Guard(Sync::getGlobalLock()); for (auto &ImagesSets : m_DeviceImages) { auto &ImagesUPtrs = *ImagesSets.second.get(); - for (auto &ImageUPtr : ImagesUPtrs) { - const RTDeviceBinaryImage *BinImage = ImageUPtr.get(); - const bundle_state ImgState = getBinImageState(BinImage); - - // Ignore images with incompatible state. Image is considered compatible - // with a target state if an image is already in the target state or can - // be brought to target state by compiling/linking/building. - // - // Example: an image in "executable" state is not compatible with - // "input" target state - there is no operation to convert the image it - // to "input" state. An image in "input" state is compatible with - // "executable" target state because it can be built to get into - // "executable" state. - if (ImgState > TargetState) - continue; - - BinImages.push_back(ImageUPtr.get()); - } + for (auto &ImageUPtr : ImagesUPtrs) + BinImages.insert(ImageUPtr.get()); } } - // TODO: Add a diagnostic on multiple device images with conflicting kernel - // names, and remove OSModuleHandle usage, as conflicting kernel names will be - // an error. + assert(BinImages.size() > 0 && "Expected to find at least on device image"); - // TODO: Cache device_image objects - // Create SYCL device image from those that have compatible state and at least - // one device std::vector SYCLDeviceImages; for (RTDeviceBinaryImage *BinImage : BinImages) { const bundle_state ImgState = getBinImageState(BinImage); + // Ignore images with incompatible state. Image is considered compatible + // with a target state if an image is already in the target state or can + // be brought to target state by compiling/linking/building. + // + // Example: an image in "executable" state is not compatible with + // "input" target state - there is no operation to convert the image it + // to "input" state. An image in "input" state is compatible with + // "executable" target state because it can be built to get into + // "executable" state. + if (ImgState > TargetState) + continue; + for (const sycl::device &Dev : Devs) { if (!compatibleWithDevice(BinImage, Dev)) continue; - std::vector KernelIDs; + std::shared_ptr> KernelIDs; // Collect kernel names for the image - pi_device_binary DevBin = - const_cast(&BinImage->getRawData()); { std::lock_guard KernelIDsGuard(m_KernelIDsMutex); - for (_pi_offload_entry EntriesIt = DevBin->EntriesBegin; - EntriesIt != DevBin->EntriesEnd; ++EntriesIt) { - auto KernelID = m_KernelIDs.find(EntriesIt->name); - - if (KernelID == m_KernelIDs.end()) { - // Service kernels and exported symbols do not have kernel IDs - assert((m_ServiceKernels.find(EntriesIt->name) != - m_ServiceKernels.end() || - m_ExportedSymbols.find(EntriesIt->name) != - m_ExportedSymbols.end()) && - "Kernel ID in device binary missing from cache"); - continue; - } - - KernelIDs.push_back(KernelID->second); - } + KernelIDs = m_BinImg2KernelIDs[BinImage]; + // If the image does not contain any non-service kernels we can skip it. + if (KernelIDs->empty()) + continue; } - // If the image does not contain any non-service kernels we can skip it. - if (KernelIDs.empty()) - continue; - - // device_image_impl expects kernel ids to be sorted for fast search - std::sort(KernelIDs.begin(), KernelIDs.end(), LessByNameComp{}); - DeviceImageImplPtr Impl = std::make_shared( BinImage, Ctx, Devs, ImgState, KernelIDs, /*PIProgram=*/nullptr); @@ -1591,8 +1591,9 @@ std::vector ProgramManager::getSYCLDeviceImages( { std::lock_guard BuiltInKernelIDsGuard(m_BuiltInKernelIDsMutex); - for (const kernel_id &ID : KernelIDs) { - if (m_BuiltInKernelIDs.find(ID.get_name()) != m_BuiltInKernelIDs.end()) + for (auto &It : m_BuiltInKernelIDs) { + if (std::find(KernelIDs.begin(), KernelIDs.end(), It.second) != + KernelIDs.end()) throw sycl::exception(make_error_code(errc::kernel_argument), "Attempting to use a built-in kernel. They are " "not fully supported"); @@ -1601,19 +1602,7 @@ std::vector ProgramManager::getSYCLDeviceImages( // Collect device images with compatible state std::vector DeviceImages = - getSYCLDeviceImagesWithCompatibleState(Ctx, Devs, TargetState); - - // Filter out images that have no kernel_ids specified - auto It = std::remove_if(DeviceImages.begin(), DeviceImages.end(), - [&KernelIDs](const device_image_plain &Image) { - return std::none_of( - KernelIDs.begin(), KernelIDs.end(), - [&Image](const sycl::kernel_id &KernelID) { - return Image.has_kernel(KernelID); - }); - }); - - DeviceImages.erase(It, DeviceImages.end()); + getSYCLDeviceImagesWithCompatibleState(Ctx, Devs, TargetState, KernelIDs); // Brind device images with compatible state to desired state bringSYCLDeviceImagesToState(DeviceImages, TargetState); @@ -1654,7 +1643,7 @@ ProgramManager::compile(const device_image_plain &DeviceImage, DeviceImageImplPtr ObjectImpl = std::make_shared( InputImpl->get_bin_image_ref(), InputImpl->get_context(), Devs, - bundle_state::object, InputImpl->get_kernel_ids_ref(), Prog, + bundle_state::object, InputImpl->get_kernel_ids_ptr(), Prog, InputImpl->get_spec_const_data_ref(), InputImpl->get_spec_const_blob_ref()); @@ -1718,15 +1707,16 @@ ProgramManager::link(const std::vector &DeviceImages, Plugin.reportPiError(Error, "link()"); } - std::vector KernelIDs; + std::shared_ptr> KernelIDs{new std::vector}; for (const device_image_plain &DeviceImage : DeviceImages) { // Duplicates are not expected here, otherwise piProgramLink should fail - KernelIDs.insert(KernelIDs.end(), - getSyclObjImpl(DeviceImage)->get_kernel_ids().begin(), - getSyclObjImpl(DeviceImage)->get_kernel_ids().end()); + KernelIDs->insert( + KernelIDs->end(), + getSyclObjImpl(DeviceImage)->get_kernel_ids_ptr()->begin(), + getSyclObjImpl(DeviceImage)->get_kernel_ids_ptr()->end()); } // device_image_impl expects kernel ids to be sorted for fast search - std::sort(KernelIDs.begin(), KernelIDs.end(), LessByNameComp{}); + std::sort(KernelIDs->begin(), KernelIDs->end(), LessByHash{}); DeviceImageImplPtr ExecutableImpl = std::make_shared( @@ -1895,7 +1885,7 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, DeviceImageImplPtr ExecImpl = std::make_shared( InputImpl->get_bin_image_ref(), Context, Devs, bundle_state::executable, - InputImpl->get_kernel_ids_ref(), ResProgram, + InputImpl->get_kernel_ids_ptr(), ResProgram, InputImpl->get_spec_const_data_ref(), InputImpl->get_spec_const_blob_ref()); diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index fbcc3bd8cb44..b50dd808787b 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -188,10 +188,9 @@ class ProgramManager { // The function returns a vector of SYCL device images that are compiled with // the required state and at least one device from the passed list of devices. - std::vector - getSYCLDeviceImagesWithCompatibleState(const context &Ctx, - const std::vector &Devs, - bundle_state TargetState); + std::vector getSYCLDeviceImagesWithCompatibleState( + const context &Ctx, const std::vector &Devs, + bundle_state TargetState, const std::vector &KernelIDs = {}); // Brind images in the passed vector to the required state. Does it inplace void @@ -314,7 +313,23 @@ class ProgramManager { /// TODO: Use std::unordered_set with transparent hash and equality functions /// when C++20 is enabled for the runtime library. /// Access must be guarded by the m_KernelIDsMutex mutex. - std::unordered_map m_KernelIDs; + // + std::unordered_map m_KernelName2KernelIDs; + + // Maps KernelIDs to device binary images. There can be more than one image + // in case of SPIRV + AOT. + /// Access must be guarded by the m_KernelIDsMutex mutex. + std::unordered_multimap + m_KernelIDs2BinImage; + + // Maps device binary image to a vector of kernel ids in this image. + // Using shared_ptr to avoid expensive copy of the vector. + // The vector is initialized in addImages function and is supposed to be + // immutable afterwards. + /// Access must be guarded by the m_KernelIDsMutex mutex. + std::unordered_map>> + m_BinImg2KernelIDs; /// Protects kernel ID cache. /// NOTE: This may be acquired while \ref Sync::getGlobalLock() is held so to diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index d196764ca670..21f308883bbf 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -218,8 +218,8 @@ bool has_kernel_bundle_impl(const context &Ctx, const std::vector &Devs, const std::shared_ptr &DeviceImageImpl = getSyclObjImpl(DeviceImage); - CombinedKernelIDs.insert(DeviceImageImpl->get_kernel_ids_ref().begin(), - DeviceImageImpl->get_kernel_ids_ref().end()); + CombinedKernelIDs.insert(DeviceImageImpl->get_kernel_ids_ptr()->begin(), + DeviceImageImpl->get_kernel_ids_ptr()->end()); } const bool AllKernelIDsRepresented = diff --git a/sycl/unittests/program_manager/EliminatedArgMask.cpp b/sycl/unittests/program_manager/EliminatedArgMask.cpp index c5c3bfe4c2c6..5301ea986ad9 100644 --- a/sycl/unittests/program_manager/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/EliminatedArgMask.cpp @@ -167,11 +167,10 @@ sycl::detail::ProgramManager::KernelArgMask getKernelArgMaskFromBundle( EXPECT_TRUE(KernelBundleImplPtr) << "Expect command group to contain kernel bundle"; - auto KernelIDImpl = - std::make_shared(ExecKernel->MKernelName); - sycl::kernel SyclKernel = KernelBundleImplPtr->get_kernel( - sycl::detail::createSyclObjFromImpl(KernelIDImpl), - KernelBundleImplPtr); + auto KernelID = sycl::detail::ProgramManager::getInstance().getSYCLKernelID( + ExecKernel->MKernelName); + sycl::kernel SyclKernel = + KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); auto SyclKernelImpl = sycl::detail::getSyclObjImpl(SyclKernel); std::shared_ptr DeviceImageImpl = SyclKernelImpl->getDeviceImage();