From 8d3c5a7b546f30f9c2caa52797566556f0ef50cd Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Wed, 12 Aug 2020 12:59:18 +0300 Subject: [PATCH 1/5] [SYCL] Add runtime support for device code argument elimination Add support for the byte array type properties. Register the new kernel parameter optimization properties while adding binary images in the program manager. This information is then used while setting kernel arguments to skip over the eliminated ones. Signed-off-by: Sergey Semenov --- sycl/include/CL/sycl/detail/pi.h | 7 +- sycl/include/CL/sycl/detail/pi.hpp | 6 ++ sycl/source/detail/pi.cpp | 31 ++++++-- .../program_manager/program_manager.cpp | 74 +++++++++++++++++++ .../program_manager/program_manager.hpp | 28 +++++++ sycl/source/detail/scheduler/commands.cpp | 44 ++++++++--- sycl/source/detail/scheduler/commands.hpp | 9 ++- 7 files changed, 176 insertions(+), 23 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index ad6410fd46def..741e80a106154 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -581,8 +581,9 @@ using _pi_offload_entry = _pi_offload_entry_struct *; // A type of a binary image property. typedef enum { PI_PROPERTY_TYPE_UNKNOWN, - PI_PROPERTY_TYPE_UINT32, // 32-bit integer - PI_PROPERTY_TYPE_STRING // null-terminated string + PI_PROPERTY_TYPE_UINT32, // 32-bit integer + PI_PROPERTY_TYPE_BYTE_ARRAY, // byte array + PI_PROPERTY_TYPE_STRING // null-terminated string } pi_property_type; // Device binary image property. @@ -652,6 +653,8 @@ static const uint8_t PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL = 4; #define PI_PROPERTY_SET_SPEC_CONST_MAP "SYCL/specialization constants" /// PropertySetRegistry::SYCL_DEVICELIB_REQ_MASK defined in PropertySetIO.h #define PI_PROPERTY_SET_DEVICELIB_REQ_MASK "SYCL/devicelib req mask" +/// PropertySetRegistry::SYCL_KERNEL_PARAM_OPT_INFO defined in PropertySetIO.h +#define PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO "SYCL/kernel param opt" /// This struct is a record of the device binary information. If the Kind field /// denotes a portable binary type (SPIR-V or LLVM IR), the DeviceTargetSpec diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index c106c5cba35bd..f2e3cd4662cdc 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -22,6 +22,7 @@ #include #include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION // Forward declarations @@ -204,6 +205,7 @@ class DeviceBinaryProperty { : Prop(Prop) {} pi_uint32 asUint32() const; + std::vector asByteArray() const; const char *asCString() const; protected: @@ -300,6 +302,9 @@ class DeviceBinaryImage { /// value is 32-bit unsigned integer ID. const PropertyRange &getSpecConstants() const { return SpecConstIDMap; } const PropertyRange &getDeviceLibReqMask() const { return DeviceLibReqMask; } + const PropertyRange &getKernelParamOptInfo() const { + return KernelParamOptInfo; + } virtual ~DeviceBinaryImage() {} protected: @@ -310,6 +315,7 @@ class DeviceBinaryImage { pi::PiDeviceBinaryType Format = PI_DEVICE_BINARY_TYPE_NONE; DeviceBinaryImage::PropertyRange SpecConstIDMap; DeviceBinaryImage::PropertyRange DeviceLibReqMask; + DeviceBinaryImage::PropertyRange KernelParamOptInfo; }; /// Tries to determine the device binary image foramat. Returns diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 68b9f3cf59a27..880762d0d0a45 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -416,6 +416,9 @@ std::ostream &operator<<(std::ostream &Out, const DeviceBinaryProperty &P) { case PI_PROPERTY_TYPE_UINT32: Out << "[UINT32] "; break; + case PI_PROPERTY_TYPE_BYTE_ARRAY: + Out << "[Byte array] "; + break; case PI_PROPERTY_TYPE_STRING: Out << "[String] "; break; @@ -425,16 +428,20 @@ std::ostream &operator<<(std::ostream &Out, const DeviceBinaryProperty &P) { } Out << P.Prop->Name << "="; - switch (P.Prop->Type) { - case PI_PROPERTY_TYPE_UINT32: + if (P.Prop->Type == PI_PROPERTY_TYPE_UINT32) { Out << P.asUint32(); - break; - case PI_PROPERTY_TYPE_STRING: + } else if (P.Prop->Type == PI_PROPERTY_TYPE_BYTE_ARRAY) { + std::vector ByteArray = P.asByteArray(); + std::ios_base::fmtflags FlagsBackup = Out.flags(); + Out << std::hex; + for (auto Byte : ByteArray) { + Out << "0x" << static_cast(Byte) << " "; + } + Out.flags(FlagsBackup); + } else if (P.Prop->Type == PI_PROPERTY_TYPE_STRING) { Out << P.asCString(); - break; - default: - assert("unsupported property"); - return Out; + } else { + assert(false && "Unsupported property"); } return Out; } @@ -491,6 +498,13 @@ pi_uint32 DeviceBinaryProperty::asUint32() const { return sycl::detail::pi::asUint32(&Prop->ValSize); } +std::vector DeviceBinaryProperty::asByteArray() const { + assert(Prop->Type == PI_PROPERTY_TYPE_BYTE_ARRAY && "property type mismatch"); + assert(Prop->ValSize > 0 && "property size mismatch"); + const auto *Data = pi::cast(Prop->ValAddr); + return {Data, Data + Prop->ValSize}; +} + const char *DeviceBinaryProperty::asCString() const { assert(Prop->Type == PI_PROPERTY_TYPE_STRING && "property type mismatch"); assert(Prop->ValSize > 0 && "property size mismatch"); @@ -550,6 +564,7 @@ void DeviceBinaryImage::init(pi_device_binary Bin) { SpecConstIDMap.init(Bin, PI_PROPERTY_SET_SPEC_CONST_MAP); DeviceLibReqMask.init(Bin, PI_PROPERTY_SET_DEVICELIB_REQ_MASK); + KernelParamOptInfo.init(Bin, PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO); } } // namespace pi diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 89a3485237474..2a76e45fff393 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -397,6 +397,10 @@ RT::PiProgram ProgramManager::getBuiltPIProgram(OSModuleHandle M, Img.getLinkOptions(), PiDevices, ContextImpl->getCachedLibPrograms(), DeviceLibReqMask); + { + std::lock_guard Lock(MNativeProgramsMutex); + NativePrograms[BuiltProgram.get()] = &Img; + } return BuiltProgram.release(); }; @@ -851,6 +855,19 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, return Program; } +static ProgramManager::KernelArgMask +createKernelArgMask(const std::vector &Bytes) { + int SizeInBits = Bytes[0]; + + ProgramManager::KernelArgMask Result; + for (int I = SizeInBits - 1; I >= 0; --I) { + unsigned char Byte = Bytes[Bytes.size() - 1 - (I / CHAR_BIT)]; + Result.push_back(Byte & (1 << (I % CHAR_BIT))); + } + + return Result; +} + void ProgramManager::addImages(pi_device_binaries DeviceBinary) { std::lock_guard Guard(Sync::getGlobalLock()); @@ -860,6 +877,17 @@ void ProgramManager::addImages(pi_device_binaries DeviceBinary) { const _pi_offload_entry EntriesB = RawImg->EntriesBegin; const _pi_offload_entry EntriesE = RawImg->EntriesEnd; auto Img = make_unique_ptr(RawImg, M); + + // Fill the kernel argument mask map + const pi::DeviceBinaryImage::PropertyRange &KPOIRange = + Img->getKernelParamOptInfo(); + if (KPOIRange.isAvailable()) { + KernelNameToArgMaskMap &ArgMaskMap = + m_EliminatedKernelArgMasks[Img.get()]; + for (const auto &Info : KPOIRange) + ArgMaskMap[Info->Name] = + createKernelArgMask(pi::DeviceBinaryProperty(Info).asByteArray()); + } // Use the entry information if it's available if (EntriesB != EntriesE) { // The kernel sets for any pair of images are either disjoint or @@ -1018,6 +1046,52 @@ uint32_t ProgramManager::getDeviceLibReqMask(const RTDeviceBinaryImage &Img) { return 0xFFFFFFFF; } +ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( + OSModuleHandle M, const context &Context, pi::PiProgram NativePrg, + const string_class &KernelName, bool KnownProgram) { + if (m_UseSpvFile && M == OSUtil::ExeModuleHandle) + return {}; + + { + std::lock_guard Lock(MNativeProgramsMutex); + auto ImgIt = NativePrograms.find(NativePrg); + if (ImgIt != NativePrograms.end()) { + auto MapIt = m_EliminatedKernelArgMasks.find(ImgIt->second); + if (MapIt != m_EliminatedKernelArgMasks.end()) + return MapIt->second[KernelName]; + return {}; + } + } + + if (KnownProgram) + throw runtime_error("Program is not associated with a binary image", + PI_INVALID_VALUE); + + // If not sure whether the program was built with one of the images, try + // finding the binary. + // TODO this can backfire in some extreme edge cases where there's a kernel + // name collision between our binaries and user-created native programs. + KernelSetId KSId; + try { + KSId = getKernelSetId(M, KernelName); + } catch (sycl::runtime_error &e) { + // If the kernel name wasn't found, assume that the program wasn't created + // from one of our device binary images. + if (e.get_cl_code() == PI_INVALID_KERNEL_NAME) + return {}; + std::rethrow_exception(std::current_exception()); + } + RTDeviceBinaryImage &Img = getDeviceImage(M, KSId, Context); + { + std::lock_guard Lock(MNativeProgramsMutex); + NativePrograms[NativePrg] = &Img; + } + auto MapIt = m_EliminatedKernelArgMasks.find(&Img); + if (MapIt != m_EliminatedKernelArgMasks.end()) + return MapIt->second[KernelName]; + return {}; +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 4a7b1045054bb..c64f4dbd181e7 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -58,6 +58,8 @@ enum class DeviceLibExt : std::uint32_t { // that is necessary for no interoperability cases with lambda. class ProgramManager { public: + using KernelArgMask = std::vector; + // Returns the single instance of the program manager for the entire // process. Can only be called after staticInit is done. static ProgramManager &getInstance(); @@ -110,6 +112,22 @@ class ProgramManager { const RTDeviceBinaryImage *Img = nullptr); uint32_t getDeviceLibReqMask(const RTDeviceBinaryImage &Img); + /// Returns the mask for eliminated kernel arguments for the requested kernel + /// within the native program. + /// \param M identifies the OS module the kernel comes from (multiple OS + /// modules may have kernels with the same name). + /// \param Context the context associated with the kernel. + /// \param NativePrg the PI program associated with the kernel. + /// \param KernelName the name of the kernel. + /// \param KnownProgram indicates whether the PI program is guaranteed to + /// be known to program manager (built with its API) or not (not + /// cacheable or constructed with interoperability). + KernelArgMask getEliminatedKernelArgMask(OSModuleHandle M, + const context &Context, + pi::PiProgram NativePrg, + const string_class &KernelName, + bool KnownProgram); + private: ProgramManager(); ~ProgramManager() = default; @@ -175,6 +193,8 @@ class ProgramManager { // - knowing which specialization constants are used in the program and // injecting their current values before compiling the SPIRV; the binary // image object has info about all spec constants used in the module + // - finding kernel argument masks for kernels associated with each + // pi_program // NOTE: using RTDeviceBinaryImage raw pointers is OK, since they are not // referenced from outside SYCL runtime and RTDeviceBinaryImage object // lifetime matches program manager's one. @@ -186,6 +206,14 @@ class ProgramManager { /// Protects NativePrograms that can be changed by class' methods. std::mutex MNativeProgramsMutex; + + using KernelNameToArgMaskMap = + std::unordered_map; + /// Maps binary image and kernel name pairs to kernel argument masks which + /// specify which arguments were eliminated during device code optimization. + std::unordered_map + m_EliminatedKernelArgMasks; + /// True iff a SPIRV file has been specified with an environment variable bool m_UseSpvFile = false; }; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d5cc7dd8a7da9..b2b8aed693658 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1647,9 +1647,19 @@ static void ReverseRangeDimensionsForKernel(NDRDescT &NDR) { pi_result ExecCGCommand::SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, RT::PiEvent &Event) { + std::vector &RawEvents, RT::PiEvent &Event, + ProgramManager::KernelArgMask EliminatedArgMask) { + assert(EliminatedArgMask.empty() || + EliminatedArgMask.size() == ExecKernel->MArgs.size()); + vector_class &Args = ExecKernel->MArgs; + std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { + return A.MIndex < B.MIndex; + }); + int NextTrueIndex = 0; const detail::plugin &Plugin = MQueue->getPlugin(); for (ArgDesc &Arg : ExecKernel->MArgs) { + if (!EliminatedArgMask.empty() && EliminatedArgMask[Arg.MIndex]) + continue; switch (Arg.MType) { case kernel_param_kind_t::kind_accessor: { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -1658,16 +1668,16 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( ? (RT::PiMem)AllocaCmd->ESIMDExt.MWrapperImage : (RT::PiMem)AllocaCmd->getMemAllocation(); if (Plugin.getBackend() == backend::opencl) { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, sizeof(RT::PiMem), &MemArg); } else { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, &MemArg); } break; } case kernel_param_kind_t::kind_std_layout: { - Plugin.call(Kernel, Arg.MIndex, Arg.MSize, + Plugin.call(Kernel, NextTrueIndex, Arg.MSize, Arg.MPtr); break; } @@ -1675,16 +1685,17 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( sampler *SamplerPtr = (sampler *)Arg.MPtr; RT::PiSampler Sampler = detail::getSyclObjImpl(*SamplerPtr) ->getOrCreateSampler(MQueue->get_context()); - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, &Sampler); break; } case kernel_param_kind_t::kind_pointer: { - Plugin.call(Kernel, Arg.MIndex, + Plugin.call(Kernel, NextTrueIndex, Arg.MSize, Arg.MPtr); break; } } + ++NextTrueIndex; } adjustNDRangePerKernel(NDRDesc, Kernel, @@ -1883,6 +1894,8 @@ cl_int ExecCGCommand::enqueueImp() { sycl::context Context = MQueue->get_context(); RT::PiKernel Kernel = nullptr; std::mutex *KernelMutex = nullptr; + RT::PiProgram Program = nullptr; + bool KnownProgram = true; if (nullptr != ExecKernel->MSyclKernel) { assert(ExecKernel->MSyclKernel->get_info() == @@ -1891,6 +1904,7 @@ cl_int ExecCGCommand::enqueueImp() { auto SyclProg = detail::getSyclObjImpl( ExecKernel->MSyclKernel->get_info()); + Program = SyclProg->getHandleRef(); if (SyclProg->is_cacheable()) { RT::PiKernel FoundKernel = nullptr; std::tie(FoundKernel, KernelMutex) = @@ -1899,23 +1913,35 @@ cl_int ExecCGCommand::enqueueImp() { ExecKernel->MSyclKernel->get_info(), ExecKernel->MKernelName, SyclProg.get()); assert(FoundKernel == Kernel); - } + } else + KnownProgram = false; } else { std::tie(Kernel, KernelMutex) = detail::ProgramManager::getInstance().getOrCreateKernel( ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName, nullptr); + MQueue->getPlugin().call( + Kernel, PI_KERNEL_INFO_PROGRAM, sizeof(RT::PiProgram), &Program, + nullptr); } pi_result Error = PI_SUCCESS; + ProgramManager::KernelArgMask EliminatedArgMask; + if (nullptr == ExecKernel->MSyclKernel || + !ExecKernel->MSyclKernel->isCreatedFromSource()) { + EliminatedArgMask = + detail::ProgramManager::getInstance().getEliminatedKernelArgMask( + ExecKernel->MOSModuleHandle, Context, Program, + ExecKernel->MKernelName, KnownProgram); + } if (KernelMutex != nullptr) { // For cacheable kernels, we use per-kernel mutex std::lock_guard Lock(*KernelMutex); Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event); + Event, EliminatedArgMask); } else { Error = SetKernelParamsAndLaunch(ExecKernel, Kernel, NDRDesc, RawEvents, - Event); + Event, EliminatedArgMask); } if (PI_SUCCESS != Error) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index c42d418fa8269..0936de076a6cc 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -19,6 +19,7 @@ #include #include #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -499,10 +500,10 @@ class ExecCGCommand : public Command { AllocaCommandBase *getAllocaForReq(Requirement *Req); - pi_result SetKernelParamsAndLaunch(CGExecKernel *ExecKernel, - RT::PiKernel Kernel, NDRDescT &NDRDesc, - std::vector &RawEvents, - RT::PiEvent &Event); + pi_result SetKernelParamsAndLaunch( + CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, + std::vector &RawEvents, RT::PiEvent &Event, + ProgramManager::KernelArgMask EliminatedArgMask); std::unique_ptr MCommandGroup; From 00a8d671b071d9ad7c6f18b47c48d8cc75b804a9 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Fri, 14 Aug 2020 14:04:37 +0300 Subject: [PATCH 2/5] Add a test + fix mask construction Signed-off-by: Sergey Semenov --- .../program_manager/program_manager.cpp | 10 ++- .../program_manager/program_manager.hpp | 1 + sycl/test/device_code_dae/device_code_dae.cpp | 71 +++++++++++++++++++ 3 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 sycl/test/device_code_dae/device_code_dae.cpp diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 2a76e45fff393..645579bbcdd7f 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -857,11 +858,14 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, static ProgramManager::KernelArgMask createKernelArgMask(const std::vector &Bytes) { - int SizeInBits = Bytes[0]; + const int NBytesForSize = 8; + std::uint64_t SizeInBits = 0; + for (int I = 0; I < NBytesForSize; ++I) + SizeInBits |= static_cast(Bytes[I]) << I * CHAR_BIT; ProgramManager::KernelArgMask Result; - for (int I = SizeInBits - 1; I >= 0; --I) { - unsigned char Byte = Bytes[Bytes.size() - 1 - (I / CHAR_BIT)]; + for (std::uint64_t I = 0; I < SizeInBits; ++I) { + unsigned char Byte = Bytes[NBytesForSize + (I / CHAR_BIT)]; Result.push_back(Byte & (1 << (I % CHAR_BIT))); } diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index c64f4dbd181e7..1e4329b3fd925 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -58,6 +58,7 @@ enum class DeviceLibExt : std::uint32_t { // that is necessary for no interoperability cases with lambda. class ProgramManager { public: + // TODO use a custom dynamic bitset instead to make initialization simpler. using KernelArgMask = std::vector; // Returns the single instance of the program manager for the entire diff --git a/sycl/test/device_code_dae/device_code_dae.cpp b/sycl/test/device_code_dae/device_code_dae.cpp new file mode 100644 index 0000000000000..a413a710835ce --- /dev/null +++ b/sycl/test/device_code_dae/device_code_dae.cpp @@ -0,0 +1,71 @@ +// NOTE A temporary test before this compilation flow is enabled by default in +// driver +// RUN: %clangxx -fsycl-device-only -Xclang -fenable-sycl-dae -Xclang -fsycl-int-header=int_header.h %s -c -o device_code.bc -I %sycl_include -Wno-sycl-strict +// RUN: %clangxx -include int_header.h -g -c %s -o host_code.o -I %sycl_include -Wno-sycl-strict +// RUN: llvm-link -o=linked_device_code.bc device_code.bc +// RUN: sycl-post-link -emit-param-info linked_device_code.bc +// RUN: llvm-spirv -o linked_device_code.spv linked_device_code.bc +// RUN: echo -e -n "[Code|Properties]\nlinked_device_code.spv|linked_device_code_0.prop" > table.txt +// RUN: clang-offload-wrapper -o wrapper.bc -host=x86_64 -kind=sycl -target=spir64 -batch table.txt +// RUN: %clangxx -c wrapper.bc -o wrapper.o +// RUN: %clangxx wrapper.o host_code.o -o app.exe -lsycl +// RUN: env SYCL_BE=%sycl_be ./app.exe + +//==---------device_code_dae.cpp - dead argument elimination test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +class KernelNameA; +class KernelNameB; +class KernelNameC; +using namespace cl::sycl; + +void verifyAndReset(buffer buf, int expected) { + auto acc = buf.get_access(); + assert(acc[0] == expected); + acc[0] = 0; +} + +int main() { + buffer buf{range<1>(1)}; + int gold = 42; + queue q; + + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task([=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); + + // Check usage of program class + program prgB{q.get_context()}; + prgB.build_with_kernel_type(); + kernel krnB = prgB.get_kernel(); + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task(krnB, [=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); + + // Check the non-cacheable case + program prgC{q.get_context()}; + prgC.compile_with_kernel_type(); + prgC.link(); + kernel krnC = prgC.get_kernel(); + q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.single_task(krnC, [=]() { acc[0] = gold; }); + }); + + verifyAndReset(buf, gold); +} From ba54ab3d148d04acf012ab441a59de98cc5f53ac Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 17 Aug 2020 11:55:45 +0300 Subject: [PATCH 3/5] Address review comments + fix a kernel_and_program_interop test case * replace unsigned char with std::uint8_t * replace the use of vector with a custom wrapper to avoid unnecessary copying * remove inappropriate use of CHAR_BIT * add some TODO's for potential improvements * fix a test case by allowing unset arguments (as long as they are set on the user side) Signed-off-by: Sergey Semenov --- sycl/include/CL/sycl/detail/pi.hpp | 19 +++++++++++++- sycl/source/detail/pi.cpp | 26 ++++++++++++------- .../program_manager/program_manager.cpp | 12 ++++++--- sycl/source/detail/scheduler/commands.cpp | 13 ++++++++-- 4 files changed, 53 insertions(+), 17 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index f2e3cd4662cdc..da3864a31c178 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -198,6 +199,22 @@ void printArgs(Arg0 arg0, Args... args) { pi::printArgs(std::forward(args)...); } +// A wrapper for passing around byte array properties +class ByteArray { +public: + using ConstIterator = const std::uint8_t *; + + ByteArray(const std::uint8_t *Ptr, std::size_t Size) : Ptr{Ptr}, Size{Size} {} + const std::uint8_t &operator[](std::size_t Idx) const { return Ptr[Idx]; } + std::size_t size() const { return Size; } + ConstIterator begin() const { return Ptr; } + ConstIterator end() const { return Ptr + Size; } + +private: + const std::uint8_t *Ptr; + const std::size_t Size; +}; + // C++ wrapper over the _pi_device_binary_property_struct structure. class DeviceBinaryProperty { public: @@ -205,7 +222,7 @@ class DeviceBinaryProperty { : Prop(Prop) {} pi_uint32 asUint32() const; - std::vector asByteArray() const; + ByteArray asByteArray() const; const char *asCString() const; protected: diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 880762d0d0a45..578728e389ee8 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -428,20 +428,26 @@ std::ostream &operator<<(std::ostream &Out, const DeviceBinaryProperty &P) { } Out << P.Prop->Name << "="; - if (P.Prop->Type == PI_PROPERTY_TYPE_UINT32) { + switch (P.Prop->Type) { + case PI_PROPERTY_TYPE_UINT32: Out << P.asUint32(); - } else if (P.Prop->Type == PI_PROPERTY_TYPE_BYTE_ARRAY) { - std::vector ByteArray = P.asByteArray(); + break; + case PI_PROPERTY_TYPE_BYTE_ARRAY: { + ByteArray BA = P.asByteArray(); std::ios_base::fmtflags FlagsBackup = Out.flags(); Out << std::hex; - for (auto Byte : ByteArray) { - Out << "0x" << static_cast(Byte) << " "; + for (const auto &Byte : BA) { + Out << "0x" << Byte << " "; } Out.flags(FlagsBackup); - } else if (P.Prop->Type == PI_PROPERTY_TYPE_STRING) { + break; + } + case PI_PROPERTY_TYPE_STRING: Out << P.asCString(); - } else { + break; + default: assert(false && "Unsupported property"); + return Out; } return Out; } @@ -498,11 +504,11 @@ pi_uint32 DeviceBinaryProperty::asUint32() const { return sycl::detail::pi::asUint32(&Prop->ValSize); } -std::vector DeviceBinaryProperty::asByteArray() const { +ByteArray DeviceBinaryProperty::asByteArray() const { assert(Prop->Type == PI_PROPERTY_TYPE_BYTE_ARRAY && "property type mismatch"); assert(Prop->ValSize > 0 && "property size mismatch"); - const auto *Data = pi::cast(Prop->ValAddr); - return {Data, Data + Prop->ValSize}; + const auto *Data = pi::cast(Prop->ValAddr); + return {Data, Prop->ValSize}; } const char *DeviceBinaryProperty::asCString() const { diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 645579bbcdd7f..fc45b310c94c6 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -857,16 +857,17 @@ ProgramManager::build(ProgramPtr Program, const ContextImplPtr Context, } static ProgramManager::KernelArgMask -createKernelArgMask(const std::vector &Bytes) { +createKernelArgMask(const pi::ByteArray &Bytes) { const int NBytesForSize = 8; + const int NBitsInElement = 8; std::uint64_t SizeInBits = 0; for (int I = 0; I < NBytesForSize; ++I) - SizeInBits |= static_cast(Bytes[I]) << I * CHAR_BIT; + SizeInBits |= static_cast(Bytes[I]) << I * NBitsInElement; ProgramManager::KernelArgMask Result; for (std::uint64_t I = 0; I < SizeInBits; ++I) { - unsigned char Byte = Bytes[NBytesForSize + (I / CHAR_BIT)]; - Result.push_back(Byte & (1 << (I % CHAR_BIT))); + std::uint8_t Byte = Bytes[NBytesForSize + (I / NBitsInElement)]; + Result.push_back(Byte & (1 << (I % NBitsInElement))); } return Result; @@ -1050,9 +1051,12 @@ uint32_t ProgramManager::getDeviceLibReqMask(const RTDeviceBinaryImage &Img) { return 0xFFFFFFFF; } +// TODO consider another approach with storing the masks in the integration +// header instead. ProgramManager::KernelArgMask ProgramManager::getEliminatedKernelArgMask( OSModuleHandle M, const context &Context, pi::PiProgram NativePrg, const string_class &KernelName, bool KnownProgram) { + // If instructed to use a spv file, assume no eliminated arguments. if (m_UseSpvFile && M == OSUtil::ExeModuleHandle) return {}; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b2b8aed693658..9bce619ce542a 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1649,15 +1649,24 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( CGExecKernel *ExecKernel, RT::PiKernel Kernel, NDRDescT &NDRDesc, std::vector &RawEvents, RT::PiEvent &Event, ProgramManager::KernelArgMask EliminatedArgMask) { - assert(EliminatedArgMask.empty() || - EliminatedArgMask.size() == ExecKernel->MArgs.size()); vector_class &Args = ExecKernel->MArgs; + // TODO this is not necessary as long as we can guarantee that the arguments + // are already sorted (e. g. handle the sorting in handler if necessary due + // to set_arg(...) usage). std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { return A.MIndex < B.MIndex; }); + int LastIndex = -1; int NextTrueIndex = 0; const detail::plugin &Plugin = MQueue->getPlugin(); for (ArgDesc &Arg : ExecKernel->MArgs) { + // Handle potential gaps in set arguments (e. g. if some of them are set + // on the user side). + for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) + if (EliminatedArgMask.empty() || !EliminatedArgMask[Arg.MIndex]) + ++NextTrueIndex; + LastIndex = Arg.MIndex; + if (!EliminatedArgMask.empty() && EliminatedArgMask[Arg.MIndex]) continue; switch (Arg.MType) { From 8df3fb607550d38988b2742eb60bfb54b3bc87aa Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 17 Aug 2020 13:01:59 +0300 Subject: [PATCH 4/5] Fix unset argument handling Signed-off-by: Sergey Semenov --- sycl/source/detail/scheduler/commands.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9bce619ce542a..efbe0373f06e6 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1663,7 +1663,7 @@ pi_result ExecCGCommand::SetKernelParamsAndLaunch( // Handle potential gaps in set arguments (e. g. if some of them are set // on the user side). for (int Idx = LastIndex + 1; Idx < Arg.MIndex; ++Idx) - if (EliminatedArgMask.empty() || !EliminatedArgMask[Arg.MIndex]) + if (EliminatedArgMask.empty() || !EliminatedArgMask[Idx]) ++NextTrueIndex; LastIndex = Arg.MIndex; From e06e37a8018cbee2cef8331bd92b63f1cf59358e Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Mon, 17 Aug 2020 13:42:23 +0300 Subject: [PATCH 5/5] Disable the added test on CUDA due to SPIR-V usage Signed-off-by: Sergey Semenov --- sycl/test/device_code_dae/device_code_dae.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sycl/test/device_code_dae/device_code_dae.cpp b/sycl/test/device_code_dae/device_code_dae.cpp index a413a710835ce..52eb03d9bb4fc 100644 --- a/sycl/test/device_code_dae/device_code_dae.cpp +++ b/sycl/test/device_code_dae/device_code_dae.cpp @@ -1,5 +1,7 @@ // NOTE A temporary test before this compilation flow is enabled by default in // driver +// UNSUPPORTED: cuda +// CUDA does not support SPIR-V. // RUN: %clangxx -fsycl-device-only -Xclang -fenable-sycl-dae -Xclang -fsycl-int-header=int_header.h %s -c -o device_code.bc -I %sycl_include -Wno-sycl-strict // RUN: %clangxx -include int_header.h -g -c %s -o host_code.o -I %sycl_include -Wno-sycl-strict // RUN: llvm-link -o=linked_device_code.bc device_code.bc