diff --git a/buildbot/configure.py b/buildbot/configure.py index f3a43857b7e1..5260a8c56336 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -23,6 +23,11 @@ def do_configure(args): libclc_amd_target_names = ';amdgcn--;amdgcn--amdhsa' libclc_nvidia_target_names = ';nvptx64--;nvptx64--nvidiacl' + sycl_enable_fusion = "OFF" + if not args.disable_fusion: + llvm_external_projects += ";sycl-fusion" + sycl_enable_fusion = "ON" + if args.llvm_external_projects: llvm_external_projects += ";" + args.llvm_external_projects.replace(",", ";") @@ -32,6 +37,7 @@ def do_configure(args): xpti_dir = os.path.join(abs_src_dir, "xpti") xptifw_dir = os.path.join(abs_src_dir, "xptifw") libdevice_dir = os.path.join(abs_src_dir, "libdevice") + fusion_dir = os.path.join(abs_src_dir, "sycl-fusion") llvm_targets_to_build = args.host_target llvm_enable_projects = 'clang;' + llvm_external_projects libclc_targets_to_build = '' @@ -144,6 +150,7 @@ def do_configure(args): "-DXPTI_SOURCE_DIR={}".format(xpti_dir), "-DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR={}".format(xptifw_dir), "-DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR={}".format(libdevice_dir), + "-DLLVM_EXTERNAL_SYCL_FUSION_SOURCE_DIR={}".format(fusion_dir), "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects), "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DLIBCLC_GENERATE_REMANGLED_VARIANTS={}".format(libclc_gen_remangled_variants), @@ -159,7 +166,8 @@ def do_configure(args): "-DLLVM_ENABLE_LLD={}".format(llvm_enable_lld), "-DXPTI_ENABLE_WERROR={}".format(xpti_enable_werror), "-DSYCL_CLANG_EXTRA_FLAGS={}".format(sycl_clang_extra_flags), - "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))) + "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))), + "-DSYCL_ENABLE_KERNEL_FUSION={}".format(sycl_enable_fusion) ] if args.l0_headers and args.l0_loader: @@ -238,6 +246,7 @@ def main(): parser.add_argument("--llvm-external-projects", help="Add external projects to build. Add as comma seperated list.") parser.add_argument("--ci-defaults", action="store_true", help="Enable default CI parameters") parser.add_argument("--enable-plugin", action='append', help="Enable SYCL plugin") + parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler") args = parser.parse_args() print("args:{}".format(args)) diff --git a/sycl-fusion/CMakeLists.txt b/sycl-fusion/CMakeLists.txt index 8be941097aba..cf1766d5bcda 100644 --- a/sycl-fusion/CMakeLists.txt +++ b/sycl-fusion/CMakeLists.txt @@ -9,6 +9,10 @@ set(SYCL_JIT_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) # directories, similar to how clang/CMakeLists.txt does it. set(LLVM_SPIRV_INCLUDE_DIRS "${LLVM_MAIN_SRC_DIR}/../llvm-spirv/include") -add_subdirectory(jit-compiler) -add_subdirectory(passes) -add_subdirectory(test) +if(WIN32) + message(WARNING "Kernel fusion not yet supported on Windows") +else(WIN32) + add_subdirectory(jit-compiler) + add_subdirectory(passes) + add_subdirectory(test) +endif(WIN32) diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt index 1b1951b01d5a..6f087e8801d2 100644 --- a/sycl-fusion/jit-compiler/CMakeLists.txt +++ b/sycl-fusion/jit-compiler/CMakeLists.txt @@ -39,6 +39,15 @@ target_link_libraries(sycl-fusion ${CMAKE_THREAD_LIBS_INIT} ) +if(NOT MSVC AND NOT APPLE) + # Manage symbol visibility through the linker to make sure no LLVM symbols + # are exported and confuse the drivers. + set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt") + target_link_libraries( + sycl-fusion PRIVATE "-Wl,--version-script=${linker_script}") + set_target_properties(sycl-fusion PROPERTIES LINK_DEPENDS ${linker_script}) +endif() + install(TARGETS sycl-fusion LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT sycl-fusion RUNTIME DESTINATION "bin" COMPONENT sycl-fusion) diff --git a/sycl-fusion/jit-compiler/ld-version-script.txt b/sycl-fusion/jit-compiler/ld-version-script.txt new file mode 100644 index 000000000000..1532381962d4 --- /dev/null +++ b/sycl-fusion/jit-compiler/ld-version-script.txt @@ -0,0 +1,8 @@ +{ + global: + /* Export everything from jit_compiler namespace */ + _ZN12jit_compiler*; + + local: + *; +}; diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp index 547a7ae64848..36bfbe1f5d79 100644 --- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp +++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp @@ -78,7 +78,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass{})); LoopUnrollOptions UnrollOptions; FPM.addPass(LoopUnrollPass{UnrollOptions}); - FPM.addPass(SROAPass{}); + FPM.addPass(SROAPass{SROAOptions::ModifyCFG}); // Run the InferAddressSpace pass to remove as many address-space casts // to/from generic address-space as possible, because these hinder // internalization. @@ -94,11 +94,11 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo, // Run additional optimization passes after completing fusion. { FunctionPassManager FPM; - FPM.addPass(SROAPass{}); + FPM.addPass(SROAPass{SROAOptions::ModifyCFG}); FPM.addPass(SCCPPass{}); FPM.addPass(InstCombinePass{}); FPM.addPass(SimplifyCFGPass{}); - FPM.addPass(SROAPass{}); + FPM.addPass(SROAPass{SROAOptions::ModifyCFG}); FPM.addPass(InstCombinePass{}); FPM.addPass(SimplifyCFGPass{}); FPM.addPass(ADCEPass{}); diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index fe9050e8f857..3e3923f46fbc 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -145,7 +145,12 @@ install(DIRECTORY ${OpenCL_INCLUDE_DIR}/CL COMPONENT OpenCL-Headers) # Option to enable online kernel fusion via a JIT compiler -option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" OFF) +option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" ON) +if(SYCL_ENABLE_KERNEL_FUSION AND WIN32) + message(WARNING "Kernel fusion not yet supported on Windows") + set(SYCL_ENABLE_KERNEL_FUSION OFF CACHE + BOOL "Kernel fusion not yet supported on Windows" FORCE) +endif() # Needed for feature_test.hpp if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake index 9f5e6dc30a48..ea8135be0f09 100644 --- a/sycl/cmake/modules/AddSYCLUnitTest.cmake +++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake @@ -56,6 +56,11 @@ macro(add_sycl_unittest test_dirname link_variant) OpenCL-Headers ${SYCL_LINK_LIBS} ) + + if(SYCL_ENABLE_KERNEL_FUSION) + target_link_libraries(${test_dirname} PRIVATE sycl-fusion) + endif(SYCL_ENABLE_KERNEL_FUSION) + target_include_directories(${test_dirname} PRIVATE SYSTEM ${sycl_inc_dir} diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md index e48e16045845..e0cb806810d1 100644 --- a/sycl/doc/GetStartedGuide.md +++ b/sycl/doc/GetStartedGuide.md @@ -12,6 +12,7 @@ and a wide range of compute accelerators such as GPU and FPGA. - [Build DPC++ toolchain with support for HIP AMD](#build-dpc-toolchain-with-support-for-hip-amd) - [Build DPC++ toolchain with support for HIP NVIDIA](#build-dpc-toolchain-with-support-for-hip-nvidia) - [Build DPC++ toolchain with support for ESIMD CPU Emulation](#build-dpc-toolchain-with-support-for-esimd-emulator) + - [Build DPC++ toolchain with support for runtime kernel fusion](#build-dpc-toolchain-with-support-for-runtime-kernel-fusion) - [Build Doxygen documentation](#build-doxygen-documentation) - [Deployment](#deployment) - [Use DPC++ toolchain](#use-dpc-toolchain) @@ -298,6 +299,16 @@ Enabling this flag requires following packages installed. Currently, this feature was tested and verified on Ubuntu 20.04 environment. +### Build DPC++ toolchain with support for runtime kernel fusion + +Support for the experimental SYCL extension for user-driven kernel fusion +at runtime is enabled by default. + +To disable support for this feature, follow the instructions for the +Linux DPC++ toolchain, but add the `--disable-fusion` flag. + +Kernel fusion is currently not yet supported on the Windows platform. + ### Build Doxygen documentation Building Doxygen documentation is similar to building the product itself. First, diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp index 6751dcb54206..b835b8f24968 100644 --- a/sycl/include/sycl/detail/cg.hpp +++ b/sycl/include/sycl/detail/cg.hpp @@ -98,6 +98,10 @@ class CG { CGTYPE getType() { return MType; } + std::vector> &getArgsStorage() { return MArgsStorage; } + + std::vector &getAccStorage() { return MAccStorage; } + virtual ~CG() = default; private: diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index ecd54a4ced59..9fabf9bc9270 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -87,6 +87,13 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME) PRIVATE OpenCL-Headers ) + if(SYCL_ENABLE_KERNEL_FUSION) + target_link_libraries(${LIB_NAME} PRIVATE sycl-fusion) + target_link_libraries(${LIB_OBJ_NAME} PRIVATE sycl-fusion) + set_property(GLOBAL APPEND PROPERTY SYCL_TOOLCHAIN_INSTALL_COMPONENTS + sycl-fusion) + endif(SYCL_ENABLE_KERNEL_FUSION) + find_package(Threads REQUIRED) target_link_libraries(${LIB_NAME} @@ -139,6 +146,8 @@ set(SYCL_SOURCES "detail/handler_proxy.cpp" "detail/image_accessor_util.cpp" "detail/image_impl.cpp" + "detail/jit_compiler.cpp" + "detail/jit_device_binaries.cpp" "detail/kernel_impl.cpp" "detail/kernel_program_cache.cpp" "detail/memory_manager.cpp" diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp new file mode 100644 index 000000000000..e1ee8e1c9430 --- /dev/null +++ b/sycl/source/detail/jit_compiler.cpp @@ -0,0 +1,881 @@ +//==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#if SYCL_EXT_CODEPLAY_KERNEL_FUSION +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { + +jit_compiler::jit_compiler() : MJITContext{new ::jit_compiler::JITContext{}} {} + +jit_compiler::~jit_compiler() = default; + +static ::jit_compiler::BinaryFormat +translateBinaryImageFormat(pi::PiDeviceBinaryType Type) { + switch (Type) { + case PI_DEVICE_BINARY_TYPE_SPIRV: + return ::jit_compiler::BinaryFormat::SPIRV; + case PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE: + return ::jit_compiler::BinaryFormat::LLVM; + default: + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Format unsupported for JIT compiler"); + } +} + +static ::jit_compiler::ParameterKind +translateArgType(kernel_param_kind_t Kind) { + using PK = ::jit_compiler::ParameterKind; + using kind = kernel_param_kind_t; + switch (Kind) { + case kind::kind_accessor: + return PK::Accessor; + case kind::kind_std_layout: + return PK::StdLayout; + case kind::kind_sampler: + return PK::Sampler; + case kind::kind_pointer: + return PK::Pointer; + case kind::kind_specialization_constants_buffer: + return PK::SpecConstBuffer; + case kind::kind_stream: + return PK::Stream; + case kind::kind_invalid: + return PK::Invalid; + } + return PK::Invalid; +} + +enum class Promotion { None, Private, Local }; + +struct PromotionInformation { + Promotion PromotionTarget; + unsigned KernelIndex; + unsigned ArgIndex; + Requirement *Definition; + NDRDescT NDRange; + size_t LocalSize; + std::vector UsedParams; +}; + +using PromotionMap = std::unordered_map; + +static inline void printPerformanceWarning(const std::string &Message) { + if (detail::SYCLConfig::get() > 0) { + std::cerr << "WARNING: " << Message << "\n"; + } +} + +template Promotion getPromotionTarget(const Obj &obj) { + auto Result = Promotion::None; + if (obj.template has_property< + ext::codeplay::experimental::property::promote_private>()) { + Result = Promotion::Private; + } + if (obj.template has_property< + ext::codeplay::experimental::property::promote_local>()) { + if (Result != Promotion::None) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Two contradicting promotion properties on the " + "same buffer/accessor are not allowed."); + } + Result = Promotion::Local; + } + return Result; +} + +static Promotion getInternalizationInfo(Requirement *Req) { + auto AccPromotion = getPromotionTarget(Req->MPropertyList); + + auto *MemObj = static_cast(Req->MSYCLMemObj); + if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) { + // We currently do not support promotion on non-buffer memory objects (e.g., + // images). + return Promotion::None; + } + Promotion BuffPromotion = getPromotionTarget(*MemObj); + if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None && + AccPromotion != BuffPromotion) { + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "Contradicting promotion properties on accessor and " + "underlying buffer are not allowed"); + } + return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion; +} + +static std::optional getLocalSize(NDRDescT NDRange, Requirement *Req, + Promotion Target) { + auto NumElementsMem = static_cast(Req->MSYCLMemObj)->size(); + if (Target == Promotion::Private) { + auto NumWorkItems = NDRange.GlobalSize.size(); + // For private internalization, the local size is + // (Number of elements in buffer)/(number of work-items) + return NumElementsMem / NumWorkItems; + } else if (Target == Promotion::Local) { + if (NDRange.LocalSize.size() == 0) { + // No work-group size provided, cannot calculate the local size + // and need to bail out. + return {}; + } + auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size(); + // For local internalization, the local size is + // (Number of elements in buffer)/(number of work-groups) + return NumElementsMem / NumWorkGroups; + } + return 0; +} + +static bool accessorEquals(Requirement *Req, Requirement *Other) { + return Req->MOffset == Other->MOffset && + Req->MAccessRange == Other->MAccessRange && + Req->MMemoryRange == Other->MMemoryRange && + Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims && + Req->MElemSize == Other->MElemSize && + Req->MOffsetInBytes == Other->MOffsetInBytes && + Req->MIsSubBuffer == Other->MIsSubBuffer; +} + +static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex, + unsigned ArgFunctionIndex, NDRDescT NDRange, + PromotionMap &Promotions) { + assert(Arg.MType == kernel_param_kind_t::kind_accessor); + + Requirement *Req = static_cast(Arg.MPtr); + + auto ThisPromotionTarget = getInternalizationInfo(Req); + auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget); + + if (Promotions.count(Req->MSYCLMemObj)) { + // We previously encountered an accessor for the same buffer. + auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj); + + switch (ThisPromotionTarget) { + case Promotion::None: { + if (PreviousDefinition.PromotionTarget != Promotion::None) { + printPerformanceWarning( + "Deactivating previously specified promotion, because this " + "accessor does not specify promotion"); + PreviousDefinition.PromotionTarget = Promotion::None; + } + return; + } + case Promotion::Local: { + if (PreviousDefinition.PromotionTarget == Promotion::None) { + printPerformanceWarning( + "Not performing specified local promotion, due to previous " + "mismatch or because previous accessor specified no promotion"); + return; + } + if (!ThisLocalSize.has_value()) { + printPerformanceWarning("Work-group size for local promotion not " + "specified, not performing internalization"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + if (PreviousDefinition.PromotionTarget == Promotion::Private) { + printPerformanceWarning( + "Overriding previous private promotion with local promotion"); + // Recompute the local size for the previous definition with adapted + // promotion target. + auto NewPrevLocalSize = + getLocalSize(PreviousDefinition.NDRange, + PreviousDefinition.Definition, Promotion::Local); + + if (!NewPrevLocalSize.has_value()) { + printPerformanceWarning( + "Not performing specified local promotion because previous " + "kernels did not specify a local size"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + PreviousDefinition.LocalSize = NewPrevLocalSize.value(); + PreviousDefinition.PromotionTarget = Promotion::Local; + } + if (PreviousDefinition.LocalSize != ThisLocalSize.value()) { + printPerformanceWarning("Not performing specified local promotion due " + "to work-group size mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + if (!accessorEquals(Req, PreviousDefinition.Definition)) { + printPerformanceWarning("Not performing specified promotion, due to " + "accessor parameter mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + return; + } + case Promotion::Private: { + if (PreviousDefinition.PromotionTarget == Promotion::None) { + printPerformanceWarning( + "Not performing specified private promotion, due to previous " + "mismatch or because previous accessor specified no promotion"); + return; + } + + if (PreviousDefinition.PromotionTarget == Promotion::Local) { + // Recompute the local size with adapted promotion target. + auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local); + if (!ThisLocalSize.has_value()) { + printPerformanceWarning("Work-group size for local promotion not " + "specified, not performing internalization"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + if (PreviousDefinition.LocalSize != ThisLocalSize.value()) { + printPerformanceWarning( + "Not performing specified local promotion due " + "to work-group size mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + if (!accessorEquals(Req, PreviousDefinition.Definition)) { + printPerformanceWarning("Not performing local promotion, due to " + "accessor parameter mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + + printPerformanceWarning( + "Performing local internalization instead, because previous " + "accessor specified local promotion"); + return; + } + + // Previous accessors also specified private promotion. + if (PreviousDefinition.LocalSize != ThisLocalSize.value()) { + printPerformanceWarning( + "Not performing specified private promotion due " + "to work-group size mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + if (!accessorEquals(Req, PreviousDefinition.Definition)) { + printPerformanceWarning("Not performing specified promotion, due to " + "accessor parameter mismatch"); + PreviousDefinition.PromotionTarget = Promotion::None; + return; + } + return; + } + } + } else { + if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) { + printPerformanceWarning("Work-group size for local promotion not " + "specified, not performing internalization"); + ThisPromotionTarget = Promotion::None; + ThisLocalSize = 0; + } + assert(ThisLocalSize.has_value()); + Promotions.emplace(Req->MSYCLMemObj, + PromotionInformation{ThisPromotionTarget, KernelIndex, + ArgFunctionIndex, Req, NDRange, + ThisLocalSize.value(), + std::vector()}); + } +} + +// Identify a parameter by the argument description, the kernel index and the +// parameter index in that kernel. +struct Param { + ArgDesc Arg; + unsigned KernelIndex; + unsigned ArgIndex; + bool Used; + Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse) + : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {} +}; + +using ParamList = std::vector; + +using ParamIterator = std::vector::iterator; + +std::vector::const_iterator +detectIdenticalParameter(std::vector &Params, ArgDesc Arg) { + for (auto I = Params.begin(); I < Params.end(); ++I) { + // Two arguments of different type can never be identical. + if (I->Arg.MType == Arg.MType) { + if (Arg.MType == kernel_param_kind_t::kind_pointer || + Arg.MType == kernel_param_kind_t::kind_std_layout) { + // Compare size and, if the size is identical, the content byte-by-byte. + if ((Arg.MSize == I->Arg.MSize) && + std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) { + return I; + } + } else if (Arg.MType == kernel_param_kind_t::kind_accessor) { + Requirement *Req = static_cast(Arg.MPtr); + Requirement *Other = static_cast(I->Arg.MPtr); + if (accessorEquals(Req, Other)) { + return I; + } + } + } + } + return Params.end(); +} + +template >> +F *storePlainArg(std::vector> &ArgStorage, T &&Arg) { + ArgStorage.emplace_back(sizeof(T)); + auto Storage = reinterpret_cast(ArgStorage.back().data()); + *Storage = Arg; + return Storage; +} + +void *storePlainArgRaw(std::vector> &ArgStorage, void *ArgPtr, + size_t ArgSize) { + ArgStorage.emplace_back(ArgSize); + void *Storage = ArgStorage.back().data(); + std::memcpy(Storage, ArgPtr, ArgSize); + return Storage; +} + +static ParamIterator preProcessArguments( + std::vector> &ArgStorage, ParamIterator Arg, + PromotionMap &PromotedAccs, + std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams, + std::vector<::jit_compiler::JITConstant> &JITConstants, + ParamList &NonIdenticalParams, + ::jit_compiler::ParamIdentList &ParamIdentities) { + + // Unused arguments are still in the list at this point (because we + // need them for accessor handling), but there's not pre-processing + // that needs to be done. + if (!Arg->Used) { + return ++Arg; + } + + if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) { + // Pointer arguments are only stored in the kernel functor object, which + // will go out-of-scope before we execute the fused kernel. Therefore, we + // need to copy the pointer (not the memory it's pointing to) to a permanent + // location and update the argument. + Arg->Arg.MPtr = + storePlainArg(ArgStorage, *static_cast(Arg->Arg.MPtr)); + } + if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) { + // Standard layout arguments are only stored in the kernel functor object, + // which will go out-of-scope before we execute the fused kernel. Therefore, + // we need to copy the argument to a permant location and update the + // argument. + Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize); + } + // First check if there's already another parameter with identical + // value. + auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg); + if (Identical != NonIdenticalParams.end()) { + ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex}; + ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex, + Identical->ArgIndex}; + ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam}; + ParamIdentities.push_back(Identity); + return ++Arg; + } + + if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) { + // Get local and private promotion information from accessors. + Requirement *Req = static_cast(Arg->Arg.MPtr); + auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj); + auto PromotionTarget = Internalization.PromotionTarget; + if (PromotionTarget == Promotion::Private || + PromotionTarget == Promotion::Local) { + // The accessor should be promoted. + if (Internalization.KernelIndex == Arg->KernelIndex && + Internalization.ArgIndex == Arg->ArgIndex) { + // This is the first accessor for this buffer that should be + // internalized. + InternalizeParams.emplace_back( + ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, + (PromotionTarget == Promotion::Private) + ? ::jit_compiler::Internalization::Private + : ::jit_compiler::Internalization::Local, + Internalization.LocalSize); + // If an accessor will be promoted, i.e., if it has the promotion + // property attached to it, the next three arguments, that are + // associated with the accessor (access range, memory range, offset), + // must not participate in identical parameter detection or constant + // propagation, because their values will change if promotion happens. + // Therefore, we can just skip them here, but we need to remember which + // of them are used. + for (unsigned I = 0; I < 4; ++I) { + Internalization.UsedParams.push_back(Arg->Used); + ++Arg; + } + } else { + // We have previously encountered an accessor the same buffer, which + // should be internalized. We can add parameter identities for the + // accessor argument and the next three arguments (range, memory range + // and offset, if they are used). + unsigned Increment = 0; + for (unsigned I = 0; I < 4; ++I) { + // If the argument is used in both cases, i.e., on the original + // accessor to be internalized, and this one, we can insert a + // parameter identity. + if (Arg->Used && Internalization.UsedParams[I]) { + ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, + Arg->ArgIndex}; + ::jit_compiler::Parameter IdenticalParam{ + Internalization.KernelIndex, + Internalization.ArgIndex + Increment}; + ::jit_compiler::ParameterIdentity Identity{ThisParam, + IdenticalParam}; + ParamIdentities.push_back(Identity); + } + if (Internalization.UsedParams[I]) { + ++Increment; + } + ++Arg; + } + } + return Arg; + } else { + // The accessor will not be promoted, so it can participate in identical + // parameter detection. + NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex, + true); + return ++Arg; + } + } else if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) { + // No identical parameter exists, so add this to the list. + NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex, + true); + // Propagate values of scalar parameters as constants to the JIT + // compiler. + JITConstants.emplace_back( + ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex}, + Arg->Arg.MPtr, Arg->Arg.MSize); + return ++Arg; + } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) { + // No identical parameter exists, so add this to the list. + NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex, + true); + return ++Arg; + } + return ++Arg; +} + +static void +updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo, + NDRDescT NDRange, std::vector &FusedArgs, + std::vector> &FusedArgStorage) { + auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask; + assert(ArgUsageInfo.size() == FusedArgs.size()); + for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) { + bool PromotedToPrivate = + (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate); + bool PromotedToLocal = + (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal); + if (PromotedToLocal || PromotedToPrivate) { + // For each internalized accessor, we need to override four arguments + // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e., + // the pointer itself, plus twice the range and the offset. + auto &OldArgDesc = FusedArgs[ArgIndex]; + assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor); + auto *Req = static_cast(OldArgDesc.MPtr); + + // The stored args are all three-dimensional, but depending on the + // actual number of dimensions of the accessor, only a part of that + // argument is later on passed to the kernel. + const size_t SizeAccField = + sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims); + // Compute the local size and use it for the range parameters. + auto LocalSize = getLocalSize(NDRange, Req, + (PromotedToPrivate) ? Promotion::Private + : Promotion::Local); + range<3> AccessRange{1, 1, LocalSize.value()}; + auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange); + // Use all-zero as the offset + id<3> AcessOffset{0, 0, 0}; + auto *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset); + + // Override the arguments. + // 1. Override the pointer with a std-layout argument with 'nullptr' as + // value. handler.cpp does the same for local accessors. + int SizeInBytes = Req->MElemSize * LocalSize.value(); + FusedArgs[ArgIndex] = + ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes, + static_cast(ArgIndex)}; + ++ArgIndex; + // 2. Access Range + FusedArgs[ArgIndex] = + ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg, + static_cast(SizeAccField), static_cast(ArgIndex)}; + ++ArgIndex; + // 3. Memory Range + FusedArgs[ArgIndex] = + ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg, + static_cast(SizeAccField), static_cast(ArgIndex)}; + ++ArgIndex; + // 4. Offset + FusedArgs[ArgIndex] = + ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg, + static_cast(SizeAccField), static_cast(ArgIndex)}; + ++ArgIndex; + } else { + ++ArgIndex; + } + } +} + +std::unique_ptr +jit_compiler::fuseKernels(QueueImplPtr Queue, + std::vector &InputKernels, + const property_list &PropList) { + // Retrieve the device binary from each of the input + // kernels to hand them over to the JIT compiler. + std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo; + std::vector InputKernelNames; + // Collect argument information from all input kernels. + std::vector> ArgsStorage; + std::vector AccStorage; + std::vector Requirements; + std::vector Events; + NDRDescT NDRDesc; + unsigned KernelIndex = 0; + ParamList FusedParams; + PromotionMap PromotedAccs; + // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary + // resources (which contain reductions) and figure out how to fuse them. + for (auto &RawCmd : InputKernels) { + auto *KernelCmd = static_cast(RawCmd); + auto &CG = KernelCmd->getCG(); + assert(CG.getType() == CG::Kernel); + auto *KernelCG = static_cast(&CG); + + auto KernelName = KernelCG->MKernelName; + if (KernelName.empty()) { + printPerformanceWarning( + "Cannot fuse kernel with invalid kernel function name"); + return nullptr; + } + const RTDeviceBinaryImage *DeviceImage = nullptr; + RT::PiProgram Program = nullptr; + if (KernelCG->getKernelBundle() != nullptr) { + // Retrieve the device image from the kernel bundle. + auto KernelBundle = KernelCG->getKernelBundle(); + kernel_id KernelID = + detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + + auto SyclKernel = detail::getSyclObjImpl( + KernelBundle->get_kernel(KernelID, KernelBundle)); + + DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = SyclKernel->getDeviceImage()->get_program_ref(); + } else if (KernelCG->MSyclKernel != nullptr) { + DeviceImage = + KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref(); + Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref(); + } else { + auto ContextImpl = Queue->getContextImplPtr(); + auto Context = detail::createSyclObjFromImpl(ContextImpl); + auto DeviceImpl = Queue->getDeviceImplPtr(); + auto Device = detail::createSyclObjFromImpl(DeviceImpl); + DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( + KernelCG->MOSModuleHandle, KernelName, Context, Device); + Program = detail::ProgramManager::getInstance().createPIProgram( + *DeviceImage, Context, Device); + } + if (!DeviceImage || !Program) { + printPerformanceWarning("No suitable IR available for fusion"); + return nullptr; + } + ProgramManager::KernelArgMask EliminatedArgs; + if (Program && (KernelCG->MSyclKernel == nullptr || + !KernelCG->MSyclKernel->isCreatedFromSource())) { + EliminatedArgs = + detail::ProgramManager::getInstance().getEliminatedKernelArgMask( + KernelCG->MOSModuleHandle, Program, KernelName); + } + + // Collect information about the arguments of this kernel. + + // Might need to sort the arguments in case they are not already sorted, + // see also the similar code in commands.cpp. + auto Args = KernelCG->MArgs; + std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) { + return A.MIndex < B.MIndex; + }); + + ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor; + size_t ArgIndex = 0; + // The kernel function in SPIR-V will only have the non-eliminated + // arguments, so keep track of this "actual" argument index. + unsigned ArgFunctionIndex = 0; + for (auto &Arg : Args) { + ArgDescriptor.Kinds.push_back(translateArgType(Arg.MType)); + // DPC++ internally uses 'true' to indicate that an argument has been + // eliminated, while the JIT compiler uses 'true' to indicate an + // argument is used. Translate this here. + bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++]; + ArgDescriptor.UsageMask.emplace_back(!Eliminated); + + // If the argument has not been eliminated, i.e., is still present on + // the kernel function in LLVM-IR/SPIR-V, collect information about the + // argument for performance optimizations in the JIT compiler. + if (!Eliminated) { + if (Arg.MType == kernel_param_kind_t::kind_accessor) { + resolveInternalization(Arg, KernelIndex, ArgFunctionIndex, + KernelCG->MNDRDesc, PromotedAccs); + } + FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true); + ++ArgFunctionIndex; + } else { + FusedParams.emplace_back(Arg, KernelIndex, 0, false); + } + } + + // TODO(Lukas, ONNX-399): Check for the correct kernel bundle state of the + // device image? + auto &RawDeviceImage = DeviceImage->getRawData(); + auto DeviceImageSize = static_cast(RawDeviceImage.BinaryEnd - + RawDeviceImage.BinaryStart); + // Set 0 as the number of address bits, because the JIT compiler can set + // this field based on information from SPIR-V/LLVM module's data-layout. + auto BinaryImageFormat = + translateBinaryImageFormat(DeviceImage->getFormat()); + if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) { + printPerformanceWarning("No suitable IR available for fusion"); + return nullptr; + } + ::jit_compiler::SYCLKernelBinaryInfo BinInfo{ + translateBinaryImageFormat(DeviceImage->getFormat()), 0, + RawDeviceImage.BinaryStart, DeviceImageSize}; + + InputKernelInfo.emplace_back(KernelName, ArgDescriptor, BinInfo); + InputKernelNames.push_back(KernelName); + + // Collect information for the fused kernel + + // TODO(Lukas, ONNX-399): Currently assuming the NDRDesc is identical for + // all input kernels. Actually verify this here or in the graph_builder. + auto &CurrentNDR = KernelCG->MNDRDesc; + if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) { + // Some overloads of parallel_for_work_group only specify the number of + // work-groups, so this can be used to identify hierarchical parallel + // kernels, which are not supported by fusion for now. + printPerformanceWarning( + "Cannot fuse kernel with hierarchical parallelism"); + return nullptr; + // Not all overloads of parallel_for_work_group only specify the number of + // work-groups, so the above mechanism might not detect all hierarchical + // parallelism. + // TODO(Lukas, CRD-6): Find a more reliable way to detect hierarchical + // parallelism. + } + if (KernelIndex == 0) { + NDRDesc = CurrentNDR; + } else { + if (CurrentNDR.Dims != NDRDesc.Dims) { + printPerformanceWarning( + "Cannot fuse kernels with different dimensionality"); + return nullptr; + } + if (CurrentNDR.GlobalOffset != NDRDesc.GlobalOffset) { + printPerformanceWarning( + "Cannot fuse kernels with different global offset"); + return nullptr; + } + if (CurrentNDR.GlobalSize != NDRDesc.GlobalSize) { + printPerformanceWarning( + "Cannot fuse kerneles with different global size"); + return nullptr; + } + if (CurrentNDR.LocalSize[0] != 0 && + CurrentNDR.LocalSize != NDRDesc.LocalSize) { + printPerformanceWarning( + "Cannot fuse kernels with different local size"); + return nullptr; + } + } + // We need to copy the storages here. The input CGs might be eliminated + // before the fused kernel gets executed, so we need to copy the storages + // here to make sure the arguments don't die on us before executing the + // fused kernel. + ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(), + KernelCG->getArgsStorage().end()); + AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(), + KernelCG->getAccStorage().end()); + // TODO(Lukas, ONNX-399): Does the MSharedPtrStorage contain any + // information about actual shared pointers beside the kernel bundle and + // handler impl? If yes, we might need to copy it here. + Requirements.insert(Requirements.end(), KernelCG->MRequirements.begin(), + KernelCG->MRequirements.end()); + Events.insert(Events.end(), KernelCG->MEvents.begin(), + KernelCG->MEvents.end()); + ++KernelIndex; + } + + // Pre-process the arguments, to detect identical parameters or arguments that + // can be constant-propagated by the JIT compiler. + std::vector<::jit_compiler::ParameterInternalization> InternalizeParams; + std::vector<::jit_compiler::JITConstant> JITConstants; + ::jit_compiler::ParamIdentList ParamIdentities; + ParamList NonIdenticalParameters; + for (auto PI = FusedParams.begin(); PI != FusedParams.end();) { + PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams, + JITConstants, NonIdenticalParameters, + ParamIdentities); + } + + // Retrieve barrier flags. + int BarrierFlags = + (PropList + .has_property()) + ? -1 + : 3; + + static size_t FusedKernelNameIndex = 0; + std::stringstream FusedKernelName; + FusedKernelName << "fused_" << FusedKernelNameIndex++; + ::jit_compiler::Config JITConfig; + bool DebugEnabled = + detail::SYCLConfig::get() > 0; + JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled); + // TODO: Enable caching in a separate PR. + + auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels( + *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames, + FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams, + JITConstants); + + if (FusionResult.failed()) { + if (DebugEnabled) { + std::cerr + << "ERROR: JIT compilation for kernel fusion failed with message:\n" + << FusionResult.getErrorMessage() << "\n"; + } + return nullptr; + } + + auto &FusedKernelInfo = FusionResult.getKernelInfo(); + + std::vector FusedArgs; + int FusedArgIndex = 0; + for (auto &Param : FusedParams) { + // Add to the argument list of the fused kernel, but with the correct + // new index in the fused kernel. + auto &Arg = Param.Arg; + FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++); + } + + // Update the kernel arguments for internalized accessors. + updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage); + + if (!FusionResult.cached()) { + auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo); + detail::ProgramManager::getInstance().addImages(PIDeviceBinaries); + } else if (DebugEnabled) { + std::cerr << "INFO: Re-using existing device binary for fused kernel\n"; + } + + // Create a kernel bundle for the fused kernel. + // Kernel bundles are stored in the CG as one of the "extended" members. + auto FusedKernelId = detail::ProgramManager::getInstance().getSYCLKernelID( + FusedKernelInfo.Name); + std::vector> RawExtendedMembers; + + std::shared_ptr KernelBundleImplPtr = + detail::getSyclObjImpl(get_kernel_bundle( + Queue->get_context(), {Queue->get_device()}, {FusedKernelId})); + + std::unique_ptr FusedCG; + FusedCG.reset(new detail::CGExecKernel( + NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr), + std::move(ArgsStorage), std::move(AccStorage), + std::move(RawExtendedMembers), std::move(Requirements), std::move(Events), + std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {}, + {}, CG::CGTYPE::Kernel)); + return FusedCG; +} + +pi_device_binaries jit_compiler::createPIDeviceBinary( + const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) { + + DeviceBinaryContainer Binary; + + // Create an offload entry for the fused kernel. + // It seems to be OK to set zero for most of the information here, at least + // that is the case for compiled SPIR-V binaries. + OffloadEntryContainer Entry{FusedKernelInfo.Name, nullptr, 0, 0, 0}; + Binary.addOffloadEntry(std::move(Entry)); + + // Create a property entry for the argument usage mask for the fused kernel. + auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask); + PropertyContainer ArgMaskProp{FusedKernelInfo.Name, ArgMask.data(), + ArgMask.size(), + pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY}; + + // Create a property set for the argument usage masks of all kernels + // (currently only one). + PropertySetContainer ArgMaskPropSet{ + __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO}; + + ArgMaskPropSet.addProperty(std::move(ArgMaskProp)); + + Binary.addProperty(std::move(ArgMaskPropSet)); + + DeviceBinariesCollection Collection; + Collection.addDeviceBinary(std::move(Binary), + FusedKernelInfo.BinaryInfo.BinaryStart, + FusedKernelInfo.BinaryInfo.BinarySize, + FusedKernelInfo.BinaryInfo.AddressBits); + + JITDeviceBinaries.push_back(std::move(Collection)); + return JITDeviceBinaries.back().getPIDeviceStruct(); +} + +std::vector jit_compiler::encodeArgUsageMask( + const ::jit_compiler::ArgUsageMask &Mask) const { + // This must match the decoding logic in program_manager.cpp. + constexpr uint64_t NBytesForSize = 8; + constexpr uint64_t NBitsInElement = 8; + uint64_t Size = static_cast(Mask.size()); + // Round the size to the next multiple of 8 + uint64_t RoundedSize = + ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1))); + std::vector Encoded((RoundedSize / NBitsInElement) + NBytesForSize, + 0u); + // First encode the size of the actual mask + for (size_t i = 0; i < NBytesForSize; ++i) { + uint8_t Byte = + static_cast((RoundedSize >> i * NBitsInElement) & 0xFF); + Encoded[i] = Byte; + } + // Encode the actual mask bit-wise + for (size_t i = 0; i < Size; ++i) { + // DPC++ internally uses 'true' to indicate that an argument has been + // eliminated, while the JIT compiler uses 'true' to indicate an argument + // is used. Translate this here. + if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) { + uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)]; + Byte |= static_cast((1 << (i % NBitsInElement))); + } + } + return Encoded; +} + +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl + +#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp new file mode 100644 index 000000000000..522c0749ef75 --- /dev/null +++ b/sycl/source/detail/jit_compiler.hpp @@ -0,0 +1,62 @@ +//==--- jit_compiler.hpp - SYCL runtime JIT compiler for kernel fusion -----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace jit_compiler { +class JITContext; +struct SYCLKernelInfo; +using ArgUsageMask = std::vector; +} // namespace jit_compiler + +struct pi_device_binaries_struct; +struct _pi_offload_entry_struct; + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { + +class jit_compiler { + +public: + std::unique_ptr + fuseKernels(QueueImplPtr Queue, std::vector &InputKernels, + const property_list &); + + static jit_compiler &get_instance() { + static jit_compiler instance{}; + return instance; + } + +private: + jit_compiler(); + ~jit_compiler(); + jit_compiler(const jit_compiler &) = delete; + jit_compiler(jit_compiler &&) = delete; + jit_compiler &operator=(const jit_compiler &) = delete; + jit_compiler &operator=(const jit_compiler &&) = delete; + + pi_device_binaries + createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo); + + std::vector + encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const; + + // Manages the lifetime of the PI structs for device binaries. + std::vector JITDeviceBinaries; + + std::unique_ptr<::jit_compiler::JITContext> MJITContext; +}; + +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp new file mode 100644 index 000000000000..0aa778da1424 --- /dev/null +++ b/sycl/source/detail/jit_device_binaries.cpp @@ -0,0 +1,138 @@ +//==- jit_device_binaries.cpp - Runtime construction of PI device binaries -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { + +OffloadEntryContainer::OffloadEntryContainer(const std::string &Name, + void *Addr, size_t Size, + int32_t Flags, int32_t Reserved) + : KernelName{new char[Name.length() + 1]}, Address{Addr}, EntrySize{Size}, + EntryFlags{Flags}, EntryReserved{Reserved} { + std::memcpy(KernelName.get(), Name.c_str(), Name.length() + 1); +} + +_pi_offload_entry_struct OffloadEntryContainer::getPIOffloadEntry() { + return _pi_offload_entry_struct{Address, KernelName.get(), EntrySize, + EntryFlags, EntryReserved}; +} + +PropertyContainer::PropertyContainer(const std::string &Name, void *Data, + size_t Size, uint32_t Type) + : PropName{new char[Name.length() + 1]}, Value{new unsigned char[Size]}, + ValueSize{Size}, PropType{Type} { + std::memcpy(PropName.get(), Name.c_str(), Name.length() + 1); + std::memcpy(Value.get(), Data, Size); +} + +_pi_device_binary_property_struct PropertyContainer::getPIProperty() { + return _pi_device_binary_property_struct{PropName.get(), Value.get(), + PropType, ValueSize}; +} + +PropertySetContainer::PropertySetContainer(const std::string &Name) + : SetName{new char[Name.length() + 1]} { + std::memcpy(SetName.get(), Name.c_str(), Name.length() + 1); +} + +void PropertySetContainer::addProperty(PropertyContainer &&Prop) { + // Adding to the vectors might trigger reallocation, which would invalidate + // the pointers used for PI structs if a PI struct has already been created + // via getPIPropertySet(). Forbid calls to this method after the first PI + // struct has been created. + assert(Fused && "Adding to container would invalidate existing PI structs"); + PIProperties.push_back(Prop.getPIProperty()); + Properties.push_back(std::move(Prop)); +} + +_pi_device_binary_property_set_struct PropertySetContainer::getPIPropertySet() { + Fused = false; + return _pi_device_binary_property_set_struct{ + const_cast(SetName.get()), PIProperties.data(), + PIProperties.data() + Properties.size()}; +} + +void DeviceBinaryContainer::addOffloadEntry(OffloadEntryContainer &&Cont) { + // Adding to the vectors might trigger reallocation, which would invalidate + // the pointers used for PI structs if a PI struct has already been created + // via getPIDeviceBinary(). Forbid calls to this method after the first PI + // struct has been created. + assert(Fused && "Adding to container would invalidate existing PI structs"); + PIOffloadEntries.push_back(Cont.getPIOffloadEntry()); + OffloadEntries.push_back(std::move(Cont)); +} + +void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) { + // Adding to the vectors might trigger reallocation, which would invalidate + // the pointers used for PI structs if a PI struct has already been created + // via getPIDeviceBinary(). Forbid calls to this method after the first PI + // struct has been created. + assert(Fused && "Adding to container would invalidate existing PI structs"); + PIPropertySets.push_back(Cont.getPIPropertySet()); + PropertySets.push_back(std::move(Cont)); +} + +pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary( + const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) { + pi_device_binary_struct DeviceBinary; + DeviceBinary.Version = PI_DEVICE_BINARY_VERSION; + DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL; + DeviceBinary.CompileOptions = ""; + DeviceBinary.LinkOptions = ""; + DeviceBinary.ManifestStart = nullptr; + DeviceBinary.ManifestEnd = nullptr; + // It is safe to use these pointers here, as their lifetime is managed by + // the JITContext. + DeviceBinary.BinaryStart = BinaryStart; + DeviceBinary.BinaryEnd = BinaryStart + BinarySize; + DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV; + DeviceBinary.DeviceTargetSpec = (AddressBits == 32) + ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32 + : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64; + DeviceBinary.EntriesBegin = PIOffloadEntries.data(); + DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size(); + DeviceBinary.PropertySetsBegin = PIPropertySets.data(); + DeviceBinary.PropertySetsEnd = PIPropertySets.data() + PIPropertySets.size(); + Fused = false; + return DeviceBinary; +} + +void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont, + const unsigned char *BinaryStart, + size_t BinarySize, + size_t AddressBits) { + // Adding to the vectors might trigger reallocation, which would invalidate + // the pointers used for PI structs if a PI struct has already been created + // via getPIDeviceStruct(). Forbid calls to this method after the first PI + // struct has been created. + assert(Fused && "Adding to container would invalidate existing PI structs"); + PIBinaries.push_back( + Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits)); + Binaries.push_back(std::move(Cont)); +} + +pi_device_binaries DeviceBinariesCollection::getPIDeviceStruct() { + + PIStruct = std::make_unique(); + PIStruct->Version = PI_DEVICE_BINARIES_VERSION; + PIStruct->NumDeviceBinaries = PIBinaries.size(); + PIStruct->DeviceBinaries = PIBinaries.data(); + // According to documentation in pi.h, the HostEntries are not used and + // can therefore be null. + PIStruct->HostEntriesBegin = nullptr; + PIStruct->HostEntriesEnd = nullptr; + Fused = false; + return PIStruct.get(); +} + +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp new file mode 100644 index 000000000000..7bf2c7d9fe07 --- /dev/null +++ b/sycl/source/detail/jit_device_binaries.hpp @@ -0,0 +1,154 @@ +//==- jit_device_binaries.hpp - Runtime construction of PI device binaries -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +namespace sycl { +__SYCL_INLINE_VER_NAMESPACE(_V1) { +namespace detail { + +/// Representation of _pi_offload_entry for creation of JIT device binaries at +/// runtime. +/// Owns the necessary data and provides raw pointers for the PI struct. +class OffloadEntryContainer { +public: + OffloadEntryContainer(const std::string &Name, void *Addr, size_t Size, + int32_t Flags, int32_t Reserved); + + OffloadEntryContainer(OffloadEntryContainer &&) = default; + OffloadEntryContainer &operator=(OffloadEntryContainer &&) = default; + ~OffloadEntryContainer() = default; + // Copying of the container is not allowed. + OffloadEntryContainer(const OffloadEntryContainer &) = delete; + OffloadEntryContainer &operator=(const OffloadEntryContainer &) = delete; + + _pi_offload_entry_struct getPIOffloadEntry(); + +private: + std::unique_ptr KernelName; + + void *Address; + size_t EntrySize; + int32_t EntryFlags; + int32_t EntryReserved; +}; + +/// Representation of _pi_device_binary_property_struct for creation of JIT +/// device binaries at runtime. +/// Owns the necessary data and provides raw pointers for the PI struct. +class PropertyContainer { + +public: + PropertyContainer(const std::string &Name, void *Data, size_t Size, + uint32_t Type); + + PropertyContainer(PropertyContainer &&) = default; + PropertyContainer &operator=(PropertyContainer &&) = default; + ~PropertyContainer() = default; + // Copying of the container is not allowed. + PropertyContainer(const PropertyContainer &) = delete; + PropertyContainer &operator=(const PropertyContainer &) = delete; + + _pi_device_binary_property_struct getPIProperty(); + +private: + std::unique_ptr PropName; + std::unique_ptr Value; + size_t ValueSize; + uint32_t PropType; +}; + +/// Representation of _pi_device_binary_property_set_struct for creation of JIT +/// device binaries at runtime. +/// Owns the necessary data and provides raw pointers for the PI struct. +class PropertySetContainer { +public: + PropertySetContainer(const std::string &Name); + + PropertySetContainer(PropertySetContainer &&) = default; + PropertySetContainer &operator=(PropertySetContainer &&) = default; + ~PropertySetContainer() = default; + // Copying of the container is not allowed, as it would invalidate PI structs. + PropertySetContainer(const PropertySetContainer &) = delete; + PropertySetContainer &operator=(const PropertySetContainer &) = delete; + + void addProperty(PropertyContainer &&Prop); + + _pi_device_binary_property_set_struct getPIPropertySet(); + +private: + std::unique_ptr SetName; + bool Fused = true; + std::vector Properties; + std::vector<_pi_device_binary_property_struct> PIProperties; +}; + +/// Representation of pi_device_binary_struct for creation of JIT device +/// binaries at runtime. +/// Owns the necessary data and provides raw pointers for the PI struct. +class DeviceBinaryContainer { +public: + DeviceBinaryContainer() = default; + DeviceBinaryContainer(DeviceBinaryContainer &&) = default; + DeviceBinaryContainer &operator=(DeviceBinaryContainer &&) = default; + ~DeviceBinaryContainer() = default; + // Copying of the container is not allowed, as it would invalidate PI structs. + DeviceBinaryContainer(const DeviceBinaryContainer &) = delete; + DeviceBinaryContainer &operator=(const DeviceBinaryContainer &) = delete; + + void addOffloadEntry(OffloadEntryContainer &&Cont); + + void addProperty(PropertySetContainer &&Cont); + + pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart, + size_t BinarySize, + size_t AddressBits); + +private: + bool Fused = true; + std::vector OffloadEntries; + std::vector<_pi_offload_entry_struct> PIOffloadEntries; + std::vector PropertySets; + std::vector<_pi_device_binary_property_set_struct> PIPropertySets; +}; + +/// Representation of pi_device_binaries_struct for creation of JIT device +/// binaries at runtime. +/// Owns the necessary data and provides raw pointers for the PI struct. +class DeviceBinariesCollection { + +public: + DeviceBinariesCollection() = default; + DeviceBinariesCollection(DeviceBinariesCollection &&) = default; + DeviceBinariesCollection &operator=(DeviceBinariesCollection &&) = default; + ~DeviceBinariesCollection() = default; + // Copying of the container is not allowed. + DeviceBinariesCollection(const DeviceBinariesCollection &) = delete; + DeviceBinariesCollection & + operator=(const DeviceBinariesCollection &) = delete; + + void addDeviceBinary(DeviceBinaryContainer &&Cont, + const unsigned char *BinaryStart, size_t BinarySize, + size_t AddressBits); + pi_device_binaries getPIDeviceStruct(); + +private: + bool Fused = true; + std::unique_ptr PIStruct; + + std::vector Binaries; + std::vector PIBinaries; +}; + +} // namespace detail +} // __SYCL_INLINE_VER_NAMESPACE(_V1) +} // namespace sycl diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 9e919aa78614..5f27ca5c0255 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -9,6 +9,10 @@ #include "detail/config.hpp" #include #include +#include +#if SYCL_EXT_CODEPLAY_KERNEL_FUSION +#include +#endif #include #include #include @@ -928,7 +932,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, const QueueImplPtr &Queue, std::vector &ToEnqueue) { std::vector &Reqs = CommandGroup->MRequirements; - const std::vector &Events = CommandGroup->MEvents; + std::vector &Events = CommandGroup->MEvents; auto NewCmd = std::make_unique(std::move(CommandGroup), Queue); if (!NewCmd) @@ -941,7 +945,40 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, auto QUniqueID = std::hash()(Queue); if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) { auto *FusionCmd = findFusionList(QUniqueID)->second.get(); - FusionCmd->addToFusionList(NewCmd.get()); + + bool dependsOnFusion = false; + for (auto Ev = Events.begin(); Ev != Events.end();) { + auto *EvDepCmd = static_cast((*Ev)->getCommand()); + if (!EvDepCmd) { + continue; + } + // Handle event dependencies on any commands part of another active + // fusion. + if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) { + printFusionWarning("Aborting fusion because of event dependency from a " + "different fusion"); + cancelFusion(EvDepCmd->getQueue(), ToEnqueue); + } + // Check if this command depends on the placeholder command for the fusion + // itself participates in. + if (EvDepCmd == FusionCmd) { + Ev = Events.erase(Ev); + dependsOnFusion = true; + } else { + ++Ev; + } + } + + // If this command has an explicit event dependency on the placeholder + // command for this fusion (because it used depends_on on the event returned + // by submitting another kernel to this fusion earlier), add a dependency on + // all the commands in the fusion list so far. + if (dependsOnFusion) { + for (auto *Cmd : FusionCmd->getFusionList()) { + Events.push_back(Cmd->getEvent()); + } + } + // Add the kernel to the graph, but delay the enqueue of any auxiliary // commands (e.g., allocations) resulting from that process by adding them // to the list of auxiliary commands of the fusion command. @@ -964,21 +1001,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, cancelFusion(DepCmd->getQueue(), ToEnqueue); } } - // Handle event dependencies on any commands part of another active fusion. - for (auto &Ev : Events) { - auto *EvDepCmd = static_cast(Ev->getCommand()); - if (!EvDepCmd) { - continue; - } - if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) { - printFusionWarning("Aborting fusion because of event dependency from a " - "different fusion"); - cancelFusion(EvDepCmd->getQueue(), ToEnqueue); - } - } // Set the fusion command, so we recognize when another command depends on a // kernel in the fusion list. + FusionCmd->addToFusionList(NewCmd.get()); NewCmd->MFusionCmd = FusionCmd; std::vector ToCleanUp; // Add an event dependency from the fusion placeholder command to the new @@ -1427,7 +1453,8 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue, // TODO: The logic to invoke the JIT compiler to create a fused kernel from // the list will be added in a later PR. - auto FusedCG = nullptr; + auto FusedCG = detail::jit_compiler::get_instance().fuseKernels( + Queue, CmdList, PropList); if (!FusedCG) { // If the JIT compiler returns a nullptr, JIT compilation of the fused diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index 0fcad85f2060..3257fe7d261e 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -120,3 +120,7 @@ if(SYCL_BUILD_PI_HIP) add_dependencies(check-sycl check-sycl-hip) endif() + +if(SYCL_ENABLE_KERNEL_FUSION) + add_dependencies(check-sycl check-sycl-fusion) +endif(SYCL_ENABLE_KERNEL_FUSION)