diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index ad6410fd46def..741e80a106154 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -581,8 +581,9 @@ using _pi_offload_entry = _pi_offload_entry_struct *; // A type of a binary image property. typedef enum { PI_PROPERTY_TYPE_UNKNOWN, - PI_PROPERTY_TYPE_UINT32, // 32-bit integer - PI_PROPERTY_TYPE_STRING // null-terminated string + PI_PROPERTY_TYPE_UINT32, // 32-bit integer + PI_PROPERTY_TYPE_BYTE_ARRAY, // byte array + PI_PROPERTY_TYPE_STRING // null-terminated string } pi_property_type; // Device binary image property. @@ -652,6 +653,8 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; #define PI_PROPERTY_SET_SPEC_CONST_MAP "SYCL/specialization constants" /// PropertySetRegistry::SYCL_DEVICELIB_REQ_MASK defined in PropertySetIO.h #define PI_PROPERTY_SET_DEVICELIB_REQ_MASK "SYCL/devicelib req mask" +/// PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h +#define PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO "SYCL/kernel param opt" /// This struct is a record of the device binary information. If the Kind field /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index c106c5cba35bd..da3864a31c178 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -20,8 +20,10 @@ #include #include +#include #include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION // Forward declarations @@ -197,6 +199,22 @@ void printArgs(Arg0 arg0, Args... args) { pi::printArgs(std::forward(args)...); } +// A wrapper for passing around byte array properties +class ByteArray { +public: + using ConstIterator = const std::uint8_t *; + + ByteArray(const std::uint8_t *Ptr, std::size_t Size) : Ptr{Ptr}, Size{Size} {} + const std::uint8_t &operator[](std::size_t Idx) const { return Ptr[Idx]; } + std::size_t size() const { return Size; } + ConstIterator begin() const { return Ptr; } + ConstIterator end() const { return Ptr + Size; } + +private: + const std::uint8_t *Ptr; + const std::size_t Size; +}; + // C++ wrapper over the _pi_device_binary_property_struct structure. class DeviceBinaryProperty { public: @@ -204,6 +222,7 @@ class DeviceBinaryProperty { : Prop(Prop) {} pi_uint32 asUint32() const; + ByteArray asByteArray() const; const char *asCString() const; protected: @@ -300,6 +319,9 @@ class DeviceBinaryImage { /// value is 32-bit unsigned integer ID. const PropertyRange &getSpecConstants() const { return SpecConstIDMap; } const PropertyRange &getDeviceLibReqMask() const { return DeviceLibReqMask; } + const PropertyRange &getKernelParamOptInfo() const { + return KernelParamOptInfo; + } virtual ~DeviceBinaryImage() {} protected: @@ -310,6 +332,7 @@ class DeviceBinaryImage { pi::PiDeviceBinaryType Format = PI_DEVICE_BINARY_TYPE_NONE; DeviceBinaryImage::PropertyRange SpecConstIDMap; DeviceBinaryImage::PropertyRange DeviceLibReqMask; + DeviceBinaryImage::PropertyRange KernelParamOptInfo; }; /// Tries to determine the device binary image foramat. Returns diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 68b9f3cf59a27..578728e389ee8 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -416,6 +416,9 @@ std::ostream &operator<<(std::ostream &Out, const DeviceBinaryProperty &P) { case PI_PROPERTY_TYPE_UINT32: Out << "[UINT32] "; break; + case PI_PROPERTY_TYPE_BYTE_ARRAY: + Out << "[Byte array] "; + break; case PI_PROPERTY_TYPE_STRING: Out << "[String] "; break; @@ -429,11 +432,21 @@ std::ostream &operator<<(std::ostream &Out, const DeviceBinaryProperty &P) { case PI_PROPERTY_TYPE_UINT32: Out << P.asUint32(); break; + case PI_PROPERTY_TYPE_BYTE_ARRAY: { + ByteArray BA = P.asByteArray(); + std::ios_base::fmtflags FlagsBackup = Out.flags(); + Out << std::hex; + for (const auto &Byte : BA) { + Out << "0x" << Byte << " "; + } + Out.flags(FlagsBackup); + break; + } case PI_PROPERTY_TYPE_STRING: Out << P.asCString(); break; default: - assert("unsupported property"); + assert(false && "Unsupported property"); return Out; } return Out; @@ -491,6 +504,13 @@ pi_uint32 DeviceBinaryProperty::asUint32() const { return sycl::detail::pi::asUint32(&Prop->ValSize); } +ByteArray DeviceBinaryProperty::asByteArray() const { + assert(Prop->Type == PI_PROPERTY_TYPE_BYTE_ARRAY && "property type mismatch"); + assert(Prop->ValSize > 0 && "property size mismatch"); + const auto *Data = pi::cast(Prop->ValAddr); + return {Data, Prop->ValSize}; +} + const char *DeviceBinaryProperty::asCString() const { assert(Prop->Type == PI_PROPERTY_TYPE_STRING && "property type mismatch"); assert(Prop->ValSize > 0 && "property size mismatch"); @@ -550,6 +570,7 @@ void DeviceBinaryImage::init(pi_device_binary Bin) { SpecConstIDMap.init(Bin, PI_PROPERTY_SET_SPEC_CONST_MAP); DeviceLibReqMask.init(Bin, PI_PROPERTY_SET_DEVICELIB_REQ_MASK); + KernelParamOptInfo.init(Bin, PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO); } } // namespace pi diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 89a3485237474..fc45b310c94c6 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -397,6 +398,10 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M, Img.getLinkOptions(), PiDevices, ContextImpl->getCachedLibPrograms(), DeviceLibReqMask); + { + std::lock_guard Lock(MNativeProgramsMutex); + NativePrograms[BuiltProgram.get()] = &Img; + } return BuiltProgram.release(); }; @@ -851,6 +856,23 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, return Program; } +static ProgramManager::KernelArgMask +createKernelArgMask(const pi::ByteArray &Bytes) { + const int NBytesForSize = 8; + const int NBitsInElement = 8; + std::uint64_t SizeInBits = 0; + for (int I = 0; I < NBytesForSize; ++I) + SizeInBits |= static_cast(Bytes[I]) << I * NBitsInElement; + + ProgramManager::KernelArgMask Result; + for (std::uint64_t I = 0; I < SizeInBits; ++I) { + std::uint8_t Byte = Bytes[NBytesForSize + (I / NBitsInElement)]; + Result.push_back(Byte & (1 << (I % NBitsInElement))); + } + + return Result; +} + void ProgramManager::addImages(pi_device_binaries DeviceBinary) { std::lock_guard Guard(Sync::getGlobalLock()); @@ -860,6 +882,17 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { const _pi_offload_entry EntriesB = RawImg->EntriesBegin; const _pi_offload_entry EntriesE = RawImg->EntriesEnd; auto Img = make_unique_ptr(RawImg, M); + + // Fill the kernel argument mask map + const pi::DeviceBinaryImage::PropertyRange &KPOIRange = + Img->getKernelParamOptInfo(); + if (KPOIRange.isAvailable()) { + KernelNameToArgMaskMap &ArgMaskMap = + m_EliminatedKernelArgMasks[Img.get()]; + for (const auto &Info : KPOIRange) + ArgMaskMap[Info->Name] = + createKernelArgMask(pi::DeviceBinaryProperty(Info).asByteArray()); + } // Use the entry information if it's available if (EntriesB != EntriesE) { // The kernel sets for any pair of images are either disjoint or @@ -1018,6 +1051,55 @@ uint32_t ProgramManager::getDeviceLibReqMask(const RTDeviceBinaryImage &Img) { return 0xFFFFFFFF; } +// TODO consider another approach with storing the masks in the integration +// header instead. +ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( + OSModuleHandle M, const context &Context, pi::PiProgram NativePrg, + const string_class &KernelName, bool KnownProgram) { + // If instructed to use a spv file, assume no eliminated arguments. + if (m_UseSpvFile && M == OSUtil::ExeModuleHandle) + return {}; + + { + std::lock_guard Lock(MNativeProgramsMutex); + auto ImgIt = NativePrograms.find(NativePrg); + if (ImgIt != NativePrograms.end()) { + auto MapIt = m_EliminatedKernelArgMasks.find(ImgIt->second); + if (MapIt != m_EliminatedKernelArgMasks.end()) + return MapIt->second[KernelName]; + return {}; + } + } + + if (KnownProgram) + throw runtime_error("Program is not associated with a binary image", + PI_INVALID_VALUE); + + // If not sure whether the program was built with one of the images, try + // finding the binary. + // TODO this can backfire in some extreme edge cases where there's a kernel + // name collision between our binaries and user-created native programs. + KernelSetId KSId; + try { + KSId = getKernelSetId(M, KernelName); + } catch (sycl::runtime_error &e) { + // If the kernel name wasn't found, assume that the program wasn't created + // from one of our device binary images. + if (e.get_cl_code() == PI_INVALID_KERNEL_NAME) + return {}; + std::rethrow_exception(std::current_exception()); + } + RTDeviceBinaryImage &Img = getDeviceImage(M, KSId, Context); + { + std::lock_guard Lock(MNativeProgramsMutex); + NativePrograms[NativePrg] = &Img; + } + auto MapIt = m_EliminatedKernelArgMasks.find(&Img); + if (MapIt != m_EliminatedKernelArgMasks.end()) + return MapIt->second[KernelName]; + return {}; +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 4a7b1045054bb..1e4329b3fd925 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -58,6 +58,9 @@ enum class DeviceLibExt : std::uint32_t { // that is necessary for no interoperability cases with lambda. class ProgramManager { public: + // TODO use a custom dynamic bitset instead to make initialization simpler. + using KernelArgMask = std::vector; + // Returns the single instance of the program manager for the entire // process. Can only be called after staticInit is done. static ProgramManager &getInstance(); @@ -110,6 +113,22 @@ class ProgramManager { const RTDeviceBinaryImage *Img = nullptr); uint32_t getDeviceLibReqMask(const RTDeviceBinaryImage &Img); + /// Returns the mask for eliminated kernel arguments for the requested kernel + /// within the native program. + /// \param M identifies the OS module the kernel comes from (multiple OS + /// modules may have kernels with the same name). + /// \param Context the context associated with the kernel. + /// \param NativePrg the PI program associated with the kernel. + /// \param KernelName the name of the kernel. + /// \param KnownProgram indicates whether the PI program is guaranteed to + /// be known to program manager (built with its API) or not (not + /// cacheable or constructed with interoperability). + KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, + const context &Context, + pi::PiProgram NativePrg, + const string_class &KernelName, + bool KnownProgram); + private: ProgramManager(); ~ProgramManager() = default; @@ -175,6 +194,8 @@ class ProgramManager { // - knowing which specialization constants are used in the program and // injecting their current values before compiling the SPIRV; the binary // image object has info about all spec constants used in the module + // - finding kernel argument masks for kernels associated with each + // pi_program // NOTE: using RTDeviceBinaryImage raw pointers is OK, since they are not // referenced from outside SYCL runtime and RTDeviceBinaryImage object // lifetime matches program manager's one. @@ -186,6 +207,14 @@ class ProgramManager { /// Protects NativePrograms that can be changed by class' methods. std::mutex MNativeProgramsMutex; + + using KernelNameToArgMaskMap = + std::unordered_map; + /// Maps binary image and kernel name pairs to kernel argument masks which + /// specify which arguments were eliminated during device code optimization. + std::unordered_map + m_EliminatedKernelArgMasks; + /// True iff a SPIRV file has been specified with an environment variable bool m_UseSpvFile = false; }; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d5cc7dd8a7da9..efbe0373f06e6 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1647,9 +1647,28 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { pi_result ExecCGCommand::SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event) { + std::vector &RawEvents, RT::PiEvent &Event, + ProgramManager::KernelArgMask EliminatedArgMask) { + vector_class &Args = ExecKernel->MArgs; + // TODO this is not necessary as long as we can guarantee that the arguments + // are already sorted (e. g. handle the sorting in handler if necessary due + // to set_arg(...) usage). + std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { + return A.MIndex < B.MIndex; + }); + int LastIndex = -1; + int NextTrueIndex = 0; const detail::plugin &Plugin = MQueue->getPlugin(); for (ArgDesc &Arg : ExecKernel->MArgs) { + // Handle potential gaps in set arguments (e. g. if some of them are set + // on the user side). + for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) + if (EliminatedArgMask.empty() || !EliminatedArgMask[Idx]) + ++NextTrueIndex; + LastIndex = Arg.MIndex; + + if (!EliminatedArgMask.empty() && EliminatedArgMask[Arg.MIndex]) + continue; switch (Arg.MType) { case kernel_param_kind_t::kind_accessor: { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -1658,16 +1677,16 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( ? (RT::PiMem)AllocaCmd->ESIMDExt.MWrapperImage : (RT::PiMem)AllocaCmd->getMemAllocation(); if (Plugin.getBackend() == backend::opencl) { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, sizeof(RT::PiMem), &MemArg); } else { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, &MemArg); } break; } case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, Arg.MIndex, Arg.MSize, + Plugin.call(Kernel, NextTrueIndex, Arg.MSize, Arg.MPtr); break; } @@ -1675,16 +1694,17 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( sampler *SamplerPtr = (sampler *)Arg.MPtr; RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) ->getOrCreateSampler(MQueue->get_context()); - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, &Sampler); break; } case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, Arg.MSize, Arg.MPtr); break; } } + ++NextTrueIndex; } adjustNDRangePerKernel(NDRDesc, Kernel, @@ -1883,6 +1903,8 @@ cl_int ExecCGCommand::enqueueImp() { sycl::context Context = MQueue->get_context(); RT::PiKernel Kernel = nullptr; std::mutex *KernelMutex = nullptr; + RT::PiProgram Program = nullptr; + bool KnownProgram = true; if (nullptr != ExecKernel->MSyclKernel) { assert(ExecKernel->MSyclKernel->get_info() == @@ -1891,6 +1913,7 @@ cl_int ExecCGCommand::enqueueImp() { auto SyclProg = detail::getSyclObjImpl( ExecKernel->MSyclKernel->get_info()); + Program = SyclProg->getHandleRef(); if (SyclProg->is_cacheable()) { RT::PiKernel FoundKernel = nullptr; std::tie(FoundKernel, KernelMutex) = @@ -1899,23 +1922,35 @@ cl_int ExecCGCommand::enqueueImp() { ExecKernel->MSyclKernel->get_info(), ExecKernel->MKernelName, SyclProg.get()); assert(FoundKernel == Kernel); - } + } else + KnownProgram = false; } else { std::tie(Kernel, KernelMutex) = detail::ProgramManager::getInstance().getOrCreateKernel( ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, nullptr); + MQueue->getPlugin().call( + Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(RT::PiProgram), &Program, + nullptr); } pi_result Error = PI_SUCCESS; + ProgramManager::KernelArgMask EliminatedArgMask; + if (nullptr == ExecKernel->MSyclKernel || + !ExecKernel->MSyclKernel->isCreatedFromSource()) { + EliminatedArgMask = + detail::ProgramManager::getInstance().getEliminatedKernelArgMask( + ExecKernel->MOSModuleHandle, Context, Program, + ExecKernel->MKernelName, KnownProgram); + } if (KernelMutex != nullptr) { // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event); + Event, EliminatedArgMask); } else { Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event); + Event, EliminatedArgMask); } if (PI_SUCCESS != Error) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index c42d418fa8269..0936de076a6cc 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -19,6 +19,7 @@ #include #include #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -499,10 +500,10 @@ class ExecCGCommand : public Command { AllocaCommandBase *getAllocaForReq(Requirement *Req); - pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel, - RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, - RT::PiEvent &Event); + pi_result SetKernelParamsAndLaunch( + CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, RT::PiEvent &Event, + ProgramManager::KernelArgMask EliminatedArgMask); std::unique_ptr MCommandGroup; diff --git a/sycl/test/device_code_dae/device_code_dae.cpp b/sycl/test/device_code_dae/device_code_dae.cpp new file mode 100644 index 0000000000000..52eb03d9bb4fc --- /dev/null +++ b/sycl/test/device_code_dae/device_code_dae.cpp @@ -0,0 +1,73 @@ +// NOTE A temporary test before this compilation flow is enabled by default in +// driver +// UNSUPPORTED: cuda +// CUDA does not support SPIR-V. +// RUN: %clangxx -fsycl-device-only -Xclang -fenable-sycl-dae -Xclang -fsycl-int-header=int_header.h %s -c -o device_code.bc -I %sycl_include -Wno-sycl-strict +// RUN: %clangxx -include int_header.h -g -c %s -o host_code.o -I %sycl_include -Wno-sycl-strict +// RUN: llvm-link -o=linked_device_code.bc device_code.bc +// RUN: sycl-post-link -emit-param-info linked_device_code.bc +// RUN: llvm-spirv -o linked_device_code.spv linked_device_code.bc +// RUN: echo -e -n "[Code|Properties]\nlinked_device_code.spv|linked_device_code_0.prop" > table.txt +// RUN: clang-offload-wrapper -o wrapper.bc -host=x86_64 -kind=sycl -target=spir64 -batch table.txt +// RUN: %clangxx -c wrapper.bc -o wrapper.o +// RUN: %clangxx wrapper.o host_code.o -o app.exe -lsycl +// RUN: env SYCL_BE=%sycl_be ./app.exe + +//==---------device_code_dae.cpp - dead argument elimination test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +class KernelNameA; +class KernelNameB; +class KernelNameC; +using namespace cl::sycl; + +void verifyAndReset(buffer buf, int expected) { + auto acc = buf.get_access(); + assert(acc[0] == expected); + acc[0] = 0; +} + +int main() { + buffer buf{range<1>(1)}; + int gold = 42; + queue q; + + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task([=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); + + // Check usage of program class + program prgB{q.get_context()}; + prgB.build_with_kernel_type(); + kernel krnB = prgB.get_kernel(); + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task(krnB, [=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); + + // Check the non-cacheable case + program prgC{q.get_context()}; + prgC.compile_with_kernel_type(); + prgC.link(); + kernel krnC = prgC.get_kernel(); + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task(krnC, [=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); +}