From e68f8960434b19b23d4dd4ec77554cf420d682f3 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 8 Jul 2025 09:48:05 +0100 Subject: [PATCH 01/10] Revert "Revert " [SYCL] RTC support for AMD and Nvidia GPU targets (#18918)" (#19304)" This reverts commit 29e7b63c39fc6208af810ca2ab2f49d6ae0d9b90. --- sycl-jit/jit-compiler/CMakeLists.txt | 1 + sycl-jit/jit-compiler/include/RTC.h | 11 +- .../lib/rtc/DeviceCompilation.cpp | 138 +++++++++++++++--- .../jit-compiler/lib/rtc/DeviceCompilation.h | 8 +- sycl-jit/jit-compiler/lib/rtc/RTC.cpp | 22 +-- .../lib/translation/Translation.cpp | 89 +++++------ .../lib/translation/Translation.h | 4 +- sycl/source/detail/device_image_impl.hpp | 42 +++++- sycl/source/detail/jit_compiler.cpp | 75 ++++++++-- sycl/source/detail/jit_compiler.hpp | 3 +- .../kernel_compiler/kernel_compiler_sycl.cpp | 5 +- .../kernel_compiler/kernel_compiler_sycl.hpp | 5 +- sycl/source/kernel_bundle.cpp | 6 +- sycl/test-e2e/KernelCompiler/sycl.cpp | 18 +-- sycl/test-e2e/KernelCompiler/sycl_basic.cpp | 3 +- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 5 + .../KernelCompiler/sycl_context_error.cpp | 4 +- .../KernelCompiler/sycl_device_globals.cpp | 3 +- .../sycl_export_registration.cpp | 2 + sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 79 ++++++++++ .../KernelCompiler/sycl_include_paths.cpp | 5 +- sycl/test-e2e/KernelCompiler/sycl_join.cpp | 5 +- .../KernelCompiler/sycl_lifetimes.cpp | 8 +- sycl/test-e2e/KernelCompiler/sycl_link.cpp | 2 + .../KernelCompiler/sycl_link_common_dep.cpp | 2 + .../sycl_link_common_dep_optional_feature.cpp | 2 + .../sycl_link_export_conflict.cpp | 2 + .../sycl_link_kernel_conflict.cpp | 2 + .../KernelCompiler/sycl_namespaces.cpp | 3 +- .../test-e2e/KernelCompiler/sycl_overload.cpp | 3 +- .../KernelCompiler/sycl_time_trace.cpp | 4 +- sycl/test-e2e/format.py | 1 + .../no_sycl_hpp_in_e2e_tests.cpp | 2 +- 33 files changed, 418 insertions(+), 146 deletions(-) create mode 100644 sycl/test-e2e/KernelCompiler/sycl_imf.cpp diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt index fabf64546e9d..9e97d2c8ccfd 100644 --- a/sycl-jit/jit-compiler/CMakeLists.txt +++ b/sycl-jit/jit-compiler/CMakeLists.txt @@ -60,6 +60,7 @@ target_include_directories(sycl-jit ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_SPIRV_INCLUDE_DIRS} ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/include + ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/lib ${CMAKE_BINARY_DIR}/tools/clang/include ) target_include_directories(sycl-jit diff --git a/sycl-jit/jit-compiler/include/RTC.h b/sycl-jit/jit-compiler/include/RTC.h index a1a39d5b50ad..6a690df45964 100644 --- a/sycl-jit/jit-compiler/include/RTC.h +++ b/sycl-jit/jit-compiler/include/RTC.h @@ -176,10 +176,11 @@ class RTCResult { /// Calculates a BLAKE3 hash of the pre-processed source string described by /// \p SourceFile (considering any additional \p IncludeFiles) and the -/// concatenation of the \p UserArgs. +/// concatenation of the \p UserArgs, for a given \p Format. JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs); + View UserArgs, + BinaryFormat Format); /// Compiles, links against device libraries, and finalizes the device code in /// the source string described by \p SourceFile, considering any additional \p @@ -191,10 +192,14 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, /// /// If \p SaveIR is true and \p CachedIR is empty, the LLVM module obtained from /// the frontend invocation is wrapped in bitcode format in the result object. +/// +/// \p BinaryFormat describes the desired format of the compilation - which +/// corresponds to the backend that is being targeted. JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR); + View CachedIR, bool SaveIR, + BinaryFormat Format); /// Requests that the JIT binary referenced by \p Address is deleted from the /// `JITContext`. diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 20891e7615b5..ade0161991b2 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -8,11 +8,17 @@ #include "DeviceCompilation.h" #include "ESIMD.h" +#include "JITBinaryInfo.h" +#include "translation/Translation.h" +#include +#include +#include #include #include #include #include +#include #include #include #include @@ -313,7 +319,7 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace static void adjustArgs(const InputArgList &UserArgList, - const std::string &DPCPPRoot, + const std::string &DPCPPRoot, BinaryFormat Format, SmallVectorImpl &CommandLine) { DerivedArgList DAL{UserArgList}; const auto &OptTable = getDriverOptTable(); @@ -326,6 +332,23 @@ static void adjustArgs(const InputArgList &UserArgList, // unused argument warning. DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + auto [CPU, Features] = + Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + (void)Features; + if (Format == BinaryFormat::AMDGCN) { + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), + "amdgcn-amd-amdhsa"); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), + "amdgcn-amd-amdhsa"); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); + } else { + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), + "nvptx64-nvidia-cuda"); + DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Xsycl_backend)); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_cuda_gpu_arch_EQ), CPU); + } + } ArgStringList ASL; for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); for_each(UserArgList, @@ -362,10 +385,9 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, }); } -Expected -jit_compiler::calculateHash(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList) { +Expected jit_compiler::calculateHash( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, BinaryFormat Format) { TimeTraceScope TTS{"calculateHash"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -374,7 +396,7 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, } SmallVector CommandLine; - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -400,11 +422,10 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, return createStringError("Calculating source hash failed"); } -Expected -jit_compiler::compileDeviceCode(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList, - std::string &BuildLog, LLVMContext &Context) { +Expected jit_compiler::compileDeviceCode( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, std::string &BuildLog, + LLVMContext &Context, BinaryFormat Format) { TimeTraceScope TTS{"compileDeviceCode"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -413,7 +434,7 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, } SmallVector CommandLine; - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -431,12 +452,22 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, return createStringError(BuildLog); } -// This function is a simplified copy of the device library selection process in -// `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V target -// (no AoT, no third-party GPUs, no native CPU). Keep in sync! +// This function is a simplified copy of the device library selection process +// in `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V, or +// GPU targets (no AoT, no native CPU). Keep in sync! static bool getDeviceLibraries(const ArgList &Args, SmallVectorImpl &LibraryList, - DiagnosticsEngine &Diags) { + DiagnosticsEngine &Diags, BinaryFormat Format) { + // For CUDA/HIP we only need devicelib, early exit here. + if (Format == BinaryFormat::PTX) { + LibraryList.push_back( + Args.MakeArgString("devicelib-nvptx64-nvidia-cuda.bc")); + return false; + } else if (Format == BinaryFormat::AMDGCN) { + LibraryList.push_back(Args.MakeArgString("devicelib-amdgcn-amd-amdhsa.bc")); + return false; + } + struct DeviceLibOptInfo { StringRef DeviceLibName; StringRef DeviceLibOption; @@ -541,7 +572,8 @@ static Expected loadBitcodeLibrary(StringRef LibPath, Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, const InputArgList &UserArgList, - std::string &BuildLog) { + std::string &BuildLog, + BinaryFormat Format) { TimeTraceScope TTS{"linkDeviceLibraries"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -556,11 +588,29 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, /* ShouldOwnClient=*/false); SmallVector LibNames; - bool FoundUnknownLib = getDeviceLibraries(UserArgList, LibNames, Diags); + const bool FoundUnknownLib = + getDeviceLibraries(UserArgList, LibNames, Diags, Format); if (FoundUnknownLib) { return createStringError("Could not determine list of device libraries: %s", BuildLog.c_str()); } + const bool IsCudaHIP = + Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN; + if (IsCudaHIP) { + // Based on the OS and the format decide on the version of libspirv. + // NOTE: this will be problematic if cross-compiling between OSes. + std::string Libclc{"clc/"}; + Libclc.append( +#ifdef _WIN32 + "remangled-l32-signed_char.libspirv-" +#else + "remangled-l64-signed_char.libspirv-" +#endif + ); + Libclc.append(Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda.bc" + : "amdgcn-amd-amdhsa.bc"); + LibNames.push_back(Libclc); + } LLVMContext &Context = Module.getContext(); for (const std::string &LibName : LibNames) { @@ -578,6 +628,58 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } } + // For GPU targets we need to link against vendor provided libdevice. + if (IsCudaHIP) { + Triple T{Module.getTargetTriple()}; + Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags}; + auto [CPU, Features] = + Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format); + (void)Features; + // Helper lambda to link modules. + auto LinkInLib = [&](const StringRef LibDevice) -> Error { + ModuleUPtr LibDeviceModule; + if (auto Error = loadBitcodeLibrary(LibDevice, Context) + .moveInto(LibDeviceModule)) { + return Error; + } + if (Linker::linkModules(Module, std::move(LibDeviceModule), + Linker::LinkOnlyNeeded)) { + return createStringError("Unable to link libdevice: %s", + BuildLog.c_str()); + } + return Error::success(); + }; + SmallVector LibDeviceFiles; + if (Format == BinaryFormat::PTX) { + // For NVPTX we can get away with CudaInstallationDetector. + LazyDetector CudaInstallation{D, T, + UserArgList}; + auto LibDevice = CudaInstallation->getLibDeviceFile(CPU); + if (LibDevice.empty()) { + return createStringError("Unable to find Cuda libdevice"); + } + LibDeviceFiles.push_back(LibDevice); + } else { + // AMDGPU requires entire toolchain in order to provide all common bitcode + // libraries. + clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList); + auto CommonDeviceLibs = TC.getCommonDeviceLibNames( + UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false); + if (CommonDeviceLibs.empty()) { + return createStringError("Unable to find ROCm common device libraries"); + } + for (auto &Lib : CommonDeviceLibs) { + LibDeviceFiles.push_back(Lib.Path); + } + } + for (auto &LibDeviceFile : LibDeviceFiles) { + // llvm::Error converts to false on success. + if (auto Error = LinkInLib(LibDeviceFile)) { + return Error; + } + } + } + return Error::success(); } diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h index 62da2cdb54bf..aa1b19df0cc8 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h @@ -8,6 +8,7 @@ #pragma once +#include "JITBinaryInfo.h" #include "RTC.h" #include @@ -24,16 +25,17 @@ using ModuleUPtr = std::unique_ptr; llvm::Expected calculateHash(InMemoryFile SourceFile, View IncludeFiles, - const llvm::opt::InputArgList &UserArgList); + const llvm::opt::InputArgList &UserArgList, BinaryFormat Format); llvm::Expected compileDeviceCode(InMemoryFile SourceFile, View IncludeFiles, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog, llvm::LLVMContext &Context); + std::string &BuildLog, llvm::LLVMContext &Context, + BinaryFormat Format); llvm::Error linkDeviceLibraries(llvm::Module &Module, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog); + std::string &BuildLog, BinaryFormat Format); using PostLinkResult = std::pair>; llvm::Expected diff --git a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp index e9307ccb27ee..2e42bca3a49a 100644 --- a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "RTC.h" +#include "JITBinaryInfo.h" #include "helper/ErrorHelper.h" #include "rtc/DeviceCompilation.h" #include "translation/SPIRVLLVMTranslation.h" @@ -26,7 +27,8 @@ using namespace jit_compiler; JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs) { + View UserArgs, + BinaryFormat Format) { llvm::opt::InputArgList UserArgList; if (auto Error = parseUserArgs(UserArgs).moveInto(UserArgList)) { return errorTo(std::move(Error), @@ -36,8 +38,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, auto Start = std::chrono::high_resolution_clock::now(); std::string Hash; - if (auto Error = - calculateHash(SourceFile, IncludeFiles, UserArgList).moveInto(Hash)) { + if (auto Error = calculateHash(SourceFile, IncludeFiles, UserArgList, Format) + .moveInto(Hash)) { return errorTo(std::move(Error), "Hashing failed", /*IsHash=*/false); } @@ -55,7 +57,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR) { + View CachedIR, bool SaveIR, + BinaryFormat Format) { llvm::LLVMContext Context; std::string BuildLog; configureDiagnostics(Context, BuildLog); @@ -104,7 +107,7 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, bool FromSource = !Module; if (FromSource) { if (auto Error = compileDeviceCode(SourceFile, IncludeFiles, UserArgList, - BuildLog, Context) + BuildLog, Context, Format) .moveInto(Module)) { return errorTo(std::move(Error), "Device compilation failed"); } @@ -118,7 +121,8 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, IR = RTCDeviceCodeIR{BCString.data(), BCString.data() + BCString.size()}; } - if (auto Error = linkDeviceLibraries(*Module, UserArgList, BuildLog)) { + if (auto Error = + linkDeviceLibraries(*Module, UserArgList, BuildLog, Format)) { return errorTo(std::move(Error), "Device linking failed"); } @@ -131,9 +135,9 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, for (auto [DevImgInfo, Module] : llvm::zip_equal(BundleInfo.DevImgInfos, Modules)) { - if (auto Error = Translator::translate(*Module, JITContext::getInstance(), - BinaryFormat::SPIRV) - .moveInto(DevImgInfo.BinaryInfo)) { + if (auto Error = + Translator::translate(*Module, JITContext::getInstance(), Format) + .moveInto(DevImgInfo.BinaryInfo)) { return errorTo(std::move(Error), "SPIR-V translation failed"); } } diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp index 457b6bfc0e63..99eaffd3384d 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp +++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp @@ -83,9 +83,6 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, LLVMInitializeNVPTXAsmPrinter(); LLVMInitializeNVPTXTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"nvptx64-nvidia-cuda"}; std::string ErrorMessage; @@ -99,32 +96,11 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, ErrorMessage.c_str()); } - // Give priority to user specified values (through environment variables: - // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - // If they were not set, use default and consult the module for alternatives - // (if present). - if (CPU.empty()) { - CPU = "sm_50"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - Features = "+sm_50,+ptx76"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::PTX); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); llvm::legacy::PassManager PM; @@ -166,9 +142,6 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, LLVMInitializeAMDGPUAsmPrinter(); LLVMInitializeAMDGPUTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"amdgcn-amd-amdhsa"}; std::string ErrorMessage; @@ -181,29 +154,10 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, "Failed to load and translate AMDGCN LLVM IR module with error %s", ErrorMessage.c_str()); - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - if (CPU.empty()) { - // Set to the lowest tested target according to the GetStartedGuide, section - // "Build DPC++ toolchain with support for HIP AMD" - CPU = "gfx906"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } - + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::AMDGCN); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); std::string AMDObj; @@ -226,3 +180,34 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, return &JITCtx.emplaceBinary(std::move(AMDObj), BinaryFormat::AMDGCN); #endif // JIT_SUPPORT_AMDGCN } + +std::pair Translator::getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format) { + assert((Format == BinaryFormat::AMDGCN || Format == BinaryFormat::PTX) && + "Unexpected format found"); + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; + // Give priority to user specified values (through environment variables: + // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). + auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr; + auto CPUVal = ConfigHelper::get(); + auto FeaturesVal = ConfigHelper::get(); + llvm::StringRef CPU = CPUVal.begin(); + llvm::StringRef Features = FeaturesVal.begin(); + if (CPU.empty()) { + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + } + if (Features.empty()) { + Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : ""; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + return std::make_pair(std::string{CPU}, std::string{Features}); +} diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.h b/sycl-jit/jit-compiler/lib/translation/Translation.h index 9c8dfdda5f9f..78bf4ef8ac41 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.h +++ b/sycl-jit/jit-compiler/lib/translation/Translation.h @@ -12,7 +12,6 @@ #include "JITContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" -#include namespace jit_compiler { @@ -26,6 +25,9 @@ class Translator { translate(llvm::Module &Mod, JITContext &JITCtx, BinaryFormat Format, const char *KernelName = nullptr); + std::pair static getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format); + private: /// Pair of address and size to represent a binary blob. using BinaryBlob = std::pair; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 361ec4326c09..60f75e3e97d1 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -8,6 +8,9 @@ #pragma once +#if SYCL_EXT_JIT_ENABLE +#include "JITBinaryInfo.h" +#endif // SYCL_EXT_JIT_ENABLE #include #include #include @@ -703,6 +706,28 @@ class device_image_impl return MRTCBinInfo && MRTCBinInfo->MLanguage == Lang; } + static ::jit_compiler::BinaryFormat + getTargetFormat([[maybe_unused]] const backend Backend) { +#if SYCL_EXT_JIT_ENABLE + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::opencl: + return ::jit_compiler::BinaryFormat::SPIRV; + case backend::ext_oneapi_cuda: + return ::jit_compiler::BinaryFormat::PTX; + case backend::ext_oneapi_hip: + return ::jit_compiler::BinaryFormat::AMDGCN; + default: + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "Backend does not support kernel_compiler extension"); + } +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "JIT not supported"); +#endif // SYCL_EXT_JIT_ENABLE + } + std::vector> buildFromSource( const std::vector &Devices, const std::vector &BuildOptions, @@ -733,9 +758,12 @@ class device_image_impl } } - if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) + if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) { + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::executable, BuildOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); + } std::vector DeviceVec; DeviceVec.reserve(Devices.size()); @@ -823,8 +851,10 @@ class device_image_impl "device does not support source language"); } } + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::object, CompileOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); } private: @@ -996,8 +1026,8 @@ class device_image_impl const std::vector &Options, std::string *LogPtr, const std::vector &RegisteredKernelNames, - std::vector> &OutDeviceBins) - const { + std::vector> &OutDeviceBins, + ::jit_compiler::BinaryFormat Format) const { assert(MRTCBinInfo); assert(MRTCBinInfo->MLanguage == syclex::source_language::sycl); assert(std::holds_alternative(MBinImage)); @@ -1027,7 +1057,7 @@ class device_image_impl auto [Binaries, Prefix] = syclex::detail::SYCL_JIT_Compile( RegisteredKernelNames.empty() ? SourceStr : SourceExt.str(), - MRTCBinInfo->MIncludePairs, Options, LogPtr); + MRTCBinInfo->MIncludePairs, Options, LogPtr, Format); auto &PM = detail::ProgramManager::getInstance(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 189bd3d14530..2884e25399e5 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -263,6 +263,7 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( Binary.addOffloadEntry(std::move(Entry)); } + bool FinalizationTagAdded = false; for (const auto &FPS : DevImgInfo.Properties) { bool IsDeviceGlobalsPropSet = FPS.Name == __SYCL_PROPERTY_SET_SYCL_DEVICE_GLOBALS; @@ -279,18 +280,50 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( sycl_property_type::SYCL_PROPERTY_TYPE_BYTE_ARRAY}); } } + if (FPS.Name == __SYCL_PROPERTY_SET_PROGRAM_METADATA && + DevImgInfo.BinaryInfo.Format == + ::jit_compiler::BinaryFormat::AMDGCN) { + PropSet.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + FinalizationTagAdded = true; + } Binary.addProperty(std::move(PropSet)); Binary.setCompileOptions(BundleInfo.CompileOptions.c_str()); } - Collection->addDeviceBinary(std::move(Binary), - DevImgInfo.BinaryInfo.BinaryStart, - DevImgInfo.BinaryInfo.BinarySize, - (DevImgInfo.BinaryInfo.AddressBits == 64) - ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 - : __SYCL_DEVICE_BINARY_TARGET_SPIRV32, - SYCL_DEVICE_BINARY_TYPE_SPIRV); + auto BinaryTarget = ""; + auto Format = SYCL_DEVICE_BINARY_TYPE_NONE; + switch (DevImgInfo.BinaryInfo.Format) { + case ::jit_compiler::BinaryFormat::SPIRV: + BinaryTarget = DevImgInfo.BinaryInfo.AddressBits == 64 + ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 + : __SYCL_DEVICE_BINARY_TARGET_SPIRV32; + Format = SYCL_DEVICE_BINARY_TYPE_SPIRV; + break; + case ::jit_compiler::BinaryFormat::PTX: + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_NVPTX64; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + break; + case ::jit_compiler::BinaryFormat::AMDGCN: { + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_AMDGCN; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + // If the program had no properties, the tag needs to be added now. + if (!FinalizationTagAdded) { + PropertySetContainer ProgramMetadata{ + __SYCL_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + Binary.addProperty(std::move(ProgramMetadata)); + } + break; + } + default: + assert(false && "Unsupported format"); + }; + Collection->addDeviceBinary( + std::move(Binary), DevImgInfo.BinaryInfo.BinaryStart, + DevImgInfo.BinaryInfo.BinarySize, BinaryTarget, Format); } sycl_device_binaries Binaries = Collection->getPIDeviceStruct(); @@ -311,7 +344,26 @@ void jit_compiler::destroyDeviceBinaries(sycl_device_binaries Binaries) { std::pair jit_compiler::compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr) { + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format) { + if (Format == ::jit_compiler::BinaryFormat::PTX || + Format == ::jit_compiler::BinaryFormat::AMDGCN) { + // If present, set-up the config with env variables describing CPU and + // features. + auto SetUpOption = [](const std::string &Value) { + ::jit_compiler::JITEnvVar Option(Value.begin(), Value.end()); + return Option; + }; + ::jit_compiler::JITEnvVar TargetCPUOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetCPU::set(TargetCPUOpt)); + ::jit_compiler::JITEnvVar TargetFeaturesOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetFeatures::set(TargetFeaturesOpt)); + } + auto appendToLog = [LogPtr](const char *Msg) { if (LogPtr) { LogPtr->append(Msg); @@ -339,7 +391,7 @@ std::pair jit_compiler::compileSYCL( std::vector CachedIR; if (PersistentDeviceCodeCache::isEnabled()) { auto Result = - CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView); + CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView, Format); if (Result.failed()) { appendToLog(Result.getPreprocLog()); @@ -349,8 +401,9 @@ std::pair jit_compiler::compileSYCL( } } - auto Result = CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, - CachedIR, /*SaveIR=*/!CacheKey.empty()); + auto Result = + CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, CachedIR, + /*SaveIR=*/!CacheKey.empty(), Format); const char *BuildLog = Result.getBuildLog(); appendToLog(BuildLog); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 31f87cd2be7e..ac3ebf7d73fa 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -47,7 +47,8 @@ class jit_compiler { std::pair compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr); + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format); void destroyDeviceBinaries(sycl_device_binaries Binaries); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp index 93ec94a8ac32..418842fdc48e 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp @@ -44,7 +44,8 @@ std::pair SYCL_JIT_Compile( [[maybe_unused]] const std::string &SYCLSource, [[maybe_unused]] const include_pairs_t &IncludePairs, [[maybe_unused]] const std::vector &UserArgs, - [[maybe_unused]] std::string *LogPtr) { + [[maybe_unused]] std::string *LogPtr, + [[maybe_unused]] ::jit_compiler::BinaryFormat Format) { #if SYCL_EXT_JIT_ENABLE static std::atomic_uintptr_t CompilationCounter; std::string CompilationID = "rtc_" + std::to_string(CompilationCounter++); @@ -52,7 +53,7 @@ std::pair SYCL_JIT_Compile( for (const sycl::detail::string_view UserArg : UserArgs) UserArgStrings.push_back(UserArg.data()); return sycl::detail::jit_compiler::get_instance().compileSYCL( - CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr); + CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr, Format); #else throw sycl::exception(sycl::errc::build, "kernel_compiler via sycl-jit is not available"); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp index c088122a33ff..0dfaf63a1d8d 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp @@ -8,6 +8,9 @@ #pragma once +#if SYCL_EXT_JIT_ENABLE +#include "JITBinaryInfo.h" +#endif // SYCL_EXT_JIT_ENABLE #include #include // __SYCL_EXPORT #include @@ -39,7 +42,7 @@ userArgsAsString(const std::vector &UserArguments); std::pair SYCL_JIT_Compile(const std::string &Source, const include_pairs_t &IncludePairs, const std::vector &UserArgs, - std::string *LogPtr); + std::string *LogPtr, ::jit_compiler::BinaryFormat Format); void SYCL_JIT_Destroy(sycl_device_binaries Binaries); diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index a7e0f9289aac..8fbeb5e98bc5 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -409,8 +409,10 @@ bool is_source_kernel_bundle_supported( const std::vector &DeviceImplVec) { backend BE = DeviceImplVec[0]->getBackend(); // Support is limited to the opencl and level_zero backends. - bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) || - (BE == sycl::backend::opencl); + bool BE_Acceptable = + (BE == sycl::backend::ext_oneapi_level_zero) || + (BE == sycl::backend::opencl || BE == sycl::backend::ext_oneapi_hip || + BE == sycl::backend::ext_oneapi_cuda); if (!BE_Acceptable) return false; diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp index 3095d570be94..13003bea98a6 100644 --- a/sycl/test-e2e/KernelCompiler/sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 @@ -108,15 +107,14 @@ auto constexpr DeviceLibrariesSource = R"===( #include #include #include -#include extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) void device_libs_kernel(float *ptr) { // Extension list: llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp - // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing it. - // Only test the fp32 variants of complex, math and imf to keep this test + // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing + // it. Only test the fp32 variants of complex and math to keep this test // device-agnostic. // cl_intel_devicelib_math @@ -128,14 +126,8 @@ void device_libs_kernel(float *ptr) { // cl_intel_devicelib_cstring ptr[2] = memcmp(ptr + 2, ptr + 2, sizeof(float)); - // cl_intel_devicelib_imf - ptr[3] = sycl::ext::intel::math::sqrt(ptr[3] * 2); - - // cl_intel_devicelib_imf_bf16 - ptr[4] = sycl::ext::intel::math::float2bfloat16(ptr[4] * 0.5f); - // cl_intel_devicelib_bfloat16 - ptr[5] = sycl::ext::oneapi::bfloat16{ptr[5] / 0.25f}; + ptr[3] = sycl::ext::oneapi::bfloat16{ptr[3] / 0.25f}; } )==="; @@ -356,7 +348,7 @@ int test_device_libraries(sycl::queue q) { exe_kb kbExe = syclex::build(kbSrc); sycl::kernel k = kbExe.ext_oneapi_get_kernel("device_libs_kernel"); - constexpr size_t nElem = 6; + constexpr size_t nElem = 4; float *ptr = sycl::malloc_shared(nElem, q); for (int i = 0; i < nElem; ++i) ptr[i] = 1.0f; diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp index 5832d5ff5068..fbb7768fd8ac 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 405d97ce15e9..0cffec11ffa4 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -7,6 +7,11 @@ //===----------------------------------------------------------------------===// // REQUIRES: (opencl || level_zero) +// Unlike other RTC tests, don't run this one on Cuda/HIP. Eviction mechanism +// is based on the size of compiled kernels, which in turns depends on the +// target. Don't run eviction check for CUDA/HIP, so that we don't have to find +// a magic number that works for all binaries (and by definition is flaky). + // REQUIRES: aspect-usm_device_allocations // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp index e72c733fdbf9..7d41f3a3f79d 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp @@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) - // RUN: %{build} -o %t.out -// RUN: %{run-unfiltered-devices} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run-unfiltered-devices} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index 2296aba1d950..ba8b08584df8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -6,14 +6,13 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: opencl && gpu // UNSUPPORTED-TRACKER: GSD-4287 // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp index 410f4f3e8d75..4ca1f5296dc7 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp @@ -8,6 +8,8 @@ // REQUIRES: (opencl || level_zero) +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle is built that exports a symbol and // -- other kernel bundles that uses it are compiled/linked without it. These // -- cases should fail due to unresolved symbols, rather than picking up the diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp new file mode 100644 index 000000000000..142cc2160ac5 --- /dev/null +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -0,0 +1,79 @@ +//==--- sycl_imf.cpp --- kernel_compiler extension imf tests ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aspect-usm_device_allocations +// REQUIRES: (opencl || level_zero) + +// UNSUPPORTED: accelerator +// UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. + +// RUN: %{build} -o %t.out +// RUN: %{l0_leak_check} %{run} %t.out + +// XFAIL: preview-mode && run-mode +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 + +#include +#include +#include + +auto constexpr IMFSources = R"===( +#include +#include +#include + +extern "C" SYCL_EXTERNAL +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) +void imf_kernel(float *ptr) { + // cl_intel_devicelib_imf + ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2); + + // cl_intel_devicelib_imf_bf16 + ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f); +} +)==="; + +int main() { +#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER + sycl::queue q; + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::context ctx = q.get_context(); + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, IMFSources); + exe_kb kbExe = syclex::build(kbSrc); + + sycl::kernel k = kbExe.ext_oneapi_get_kernel("imf_kernel"); + constexpr size_t nElem = 2; + float *ptr = sycl::malloc_shared(nElem, q); + for (int i = 0; i < nElem; ++i) + ptr[i] = 1.0f; + + q.submit([&](sycl::handler &cgh) { + cgh.set_arg(0, ptr); + cgh.single_task(k); + }); + q.wait_and_throw(); + + // Check that the kernel was executed. Given the {1.0, 1.0} input, + // the expected result is approximately {1.41, 0.5}. + for (unsigned i = 0; i < nElem; ++i) { + std::cout << ptr[i] << ' '; + assert(ptr[i] != 1.0f); + } + std::cout << std::endl; + + sycl::free(ptr, q); +#else + static_assert(false, "Kernel Compiler feature test macro undefined"); +#endif + return 0; +} diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp index 0cc50413dbad..d073390a3d6a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp @@ -6,15 +6,14 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD // COM: Run test again in a directory that contains a different version of // `header1.hpp` -// RUN: cd %S/include/C ; %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD +// RUN: cd %S/include/C ; %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp index 33cf81dd6bdc..1cc0c8e8d2d8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp @@ -6,12 +6,11 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations // RUN: %{build} -o %t.out -// RUN: %{run} %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp index 94864926af4c..7d38e44cab6c 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s #include #include @@ -48,14 +47,15 @@ int test_lifetimes() { ctx, syclex::source_language::sycl, SYCLSource); exe_kb kbExe1 = syclex::build(kbSrc); - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG1:.*]])) + // Cuda/Hip programs will be created with Binary, spirv IL. + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]])) { std::cout << "Scope1\n"; // CHECK: Scope1 exe_kb kbExe2 = syclex::build(kbSrc); // kbExe2 goes out of scope; its kernels are removed from program mananager. - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG2:.*]])) + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG2:.*]])) // CHECK: urProgramRelease{{.*}}[[PROG2]] } std::cout << "End Scope1\n"; diff --git a/sycl/test-e2e/KernelCompiler/sycl_link.cpp b/sycl/test-e2e/KernelCompiler/sycl_link.cpp index 7187488d0f0d..1453e02b773d 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link.cpp @@ -12,6 +12,8 @@ // -- Test for a simple linking case with source files compiled from SYCL source // -- at runtime. +// Note linking is not supported on CUDA/HIP. + // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %{l0_leak_check} %{run} %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp index b1d66710c2fa..f639a9bb3a33 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols. // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp index 4976a9066bc3..5183a927881e 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols, but one // -- may not be supported on the device. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp index 4f81eeb684fe..0739a5d9da00 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle with an exported symbol is compiled // -- before another kernel bundle using a different variant of the symbol. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp index 41b8c99c137a..0636fbff37bf 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for the linking of two kernels with conflicting definitions of // -- kernels with the same name. diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp index 89ad5f5e67c4..29c54161506f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp index 2e7b5fee432a..a1b6cb787d03 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp index d3916dfc3803..4b1a04d1bcb8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) + // RUN: %{build} -o %t.out -// RUN: %{run} %t.out | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out | FileCheck %s #include #include diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 44da0ef1c52c..ff2f8e36ecfb 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -244,6 +244,7 @@ def execute(self, test, litConfig): ) sycl_target_opts += hip_arch_opts substitutions.append(("%{hip_arch_opts}", hip_arch_opts)) + substitutions.append(("%{amd_arch}", test.config.amd_arch)) if ( "target-spir" in build_targets and "spirv-backend" in test.config.available_features diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index 429234206818..7f3757fc7062 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 28 +// CHECK-NUM-MATCHES: 29 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see From 90d973632272d67f15f4eef0cf6a95c5ed19055b Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 8 Jul 2025 10:14:55 +0100 Subject: [PATCH 02/10] WIP HIP device libraries via offload toolchain Signed-off-by: Julian Oppermann --- .../lib/rtc/DeviceCompilation.cpp | 74 ++++++++++++++----- 1 file changed, 56 insertions(+), 18 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index ade0161991b2..e30076c0e6d6 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -11,7 +11,6 @@ #include "JITBinaryInfo.h" #include "translation/Translation.h" -#include #include #include #include @@ -20,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +58,7 @@ using namespace llvm::opt; using namespace llvm::sycl; using namespace llvm::module_split; using namespace llvm::util; +using namespace llvm::vfs; using namespace jit_compiler; #ifdef _GNU_SOURCE @@ -318,10 +319,8 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace -static void adjustArgs(const InputArgList &UserArgList, - const std::string &DPCPPRoot, BinaryFormat Format, - SmallVectorImpl &CommandLine) { - DerivedArgList DAL{UserArgList}; +static void addRTCArgs(DerivedArgList &DAL, const std::string &DPCPPRoot, + BinaryFormat Format) { const auto &OptTable = getDriverOptTable(); DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_fsycl_device_only)); DAL.AddJoinedArg( @@ -349,14 +348,27 @@ static void adjustArgs(const InputArgList &UserArgList, DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_cuda_gpu_arch_EQ), CPU); } } +} + +static void renderArgs(const DerivedArgList &DAL, + SmallVectorImpl &CommandLine) { ArgStringList ASL; for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); + const auto &UserArgList = DAL.getBaseArgs(); for_each(UserArgList, [&UserArgList, &ASL](Arg *A) { A->render(UserArgList, ASL); }); transform(ASL, std::back_inserter(CommandLine), [](const char *AS) { return std::string{AS}; }); } +static void adjustArgs(const InputArgList &UserArgList, + const std::string &DPCPPRoot, BinaryFormat Format, + SmallVectorImpl &CommandLine) { + DerivedArgList DAL{UserArgList}; + addRTCArgs(DAL, DPCPPRoot, Format); + renderArgs(DAL, CommandLine); +} + static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, InMemoryFile SourceFile, View IncludeFiles, DiagnosticConsumer *Consumer) { @@ -517,17 +529,18 @@ static bool getDeviceLibraries(const ArgList &Args, using SYCLDeviceLibsList = SmallVector; const SYCLDeviceLibsList SYCLDeviceWrapperLibs = { - {"libsycl-crt", "libc"}, - {"libsycl-complex", "libm-fp32"}, - {"libsycl-complex-fp64", "libm-fp64"}, - {"libsycl-cmath", "libm-fp32"}, - {"libsycl-cmath-fp64", "libm-fp64"}, + {"libsycl-crt", "libc"}, + {"libsycl-complex", "libm-fp32"}, + {"libsycl-complex-fp64", "libm-fp64"}, + {"libsycl-cmath", "libm-fp32"}, + {"libsycl-cmath-fp64", "libm-fp64"}, #if defined(_WIN32) - {"libsycl-msvc-math", "libm-fp32"}, + {"libsycl-msvc-math", "libm-fp32"}, #endif - {"libsycl-imf", "libimf-fp32"}, - {"libsycl-imf-fp64", "libimf-fp64"}, - {"libsycl-imf-bf16", "libimf-bf16"}}; + {"libsycl-imf", "libimf-fp32"}, + {"libsycl-imf-fp64", "libimf-fp64"}, + {"libsycl-imf-bf16", "libimf-bf16"} + }; // ITT annotation libraries are linked in separately whenever the device // code instrumentation is enabled. const SYCLDeviceLibsList SYCLDeviceAnnotationLibs = { @@ -630,8 +643,32 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, // For GPU targets we need to link against vendor provided libdevice. if (IsCudaHIP) { + std::string Argv0 = DPCPPRoot + "/bin/clang++"; Triple T{Module.getTargetTriple()}; - Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags}; + IntrusiveRefCntPtr OFS{ + new OverlayFileSystem{getRealFileSystem()}}; + IntrusiveRefCntPtr VFS{new InMemoryFileSystem}; + std::string CppFileName{"a.cpp"}; + VFS->addFile(CppFileName, /*ModificationTime=*/0, + MemoryBuffer::getMemBuffer("", "")); + OFS->pushOverlay(VFS); + Driver D{Argv0, T.getTriple(), Diags, "dpcpp compiler driver", OFS}; + + SmallVector CommandLine; + CommandLine.push_back(Argv0); + DerivedArgList AdjustedArgs{UserArgList}; + addRTCArgs(AdjustedArgs, DPCPPRoot, Format); + renderArgs(AdjustedArgs, CommandLine); + CommandLine.push_back(CppFileName); + SmallVector CommandLineCStr(CommandLine.size()); + llvm::transform(CommandLine, CommandLineCStr.begin(), + [](const auto &S) { return S.c_str(); }); + + Compilation *C = D.BuildCompilation(CommandLineCStr); + if (!C) { + return createStringError("Unable to construct driver for CUDA/HIP"); + } + auto [CPU, Features] = Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format); (void)Features; @@ -662,9 +699,10 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } else { // AMDGPU requires entire toolchain in order to provide all common bitcode // libraries. - clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList); - auto CommonDeviceLibs = TC.getCommonDeviceLibNames( - UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false); + const ToolChain *OffloadTC = + C->getSingleOffloadToolChain(); + auto CommonDeviceLibs = + OffloadTC->getDeviceLibs(AdjustedArgs, Action::OffloadKind::OFK_SYCL); if (CommonDeviceLibs.empty()) { return createStringError("Unable to find ROCm common device libraries"); } From ba63ac43fc0dc76fd0e8b47f30cc8785d578552a Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 8 Jul 2025 11:01:56 +0100 Subject: [PATCH 03/10] Use -mcpu Signed-off-by: Julian Oppermann --- sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index e30076c0e6d6..436aba5d1f9c 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -701,6 +701,10 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, // libraries. const ToolChain *OffloadTC = C->getSingleOffloadToolChain(); + // `AdjustedArgs` already contains `--offload-arch=`, but that + // doesn't seem to be picked up by the logic called by `getDeviceLibs`. + AdjustedArgs.AddJoinedArg( + nullptr, getDriverOptTable().getOption(OPT_mcpu_EQ), CPU); auto CommonDeviceLibs = OffloadTC->getDeviceLibs(AdjustedArgs, Action::OffloadKind::OFK_SYCL); if (CommonDeviceLibs.empty()) { From 306ce760424501385c40951316574d7b6c15441f Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 8 Jul 2025 11:03:22 +0100 Subject: [PATCH 04/10] Revert format change Signed-off-by: Julian Oppermann --- .../lib/rtc/DeviceCompilation.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 436aba5d1f9c..fc0bdeb0d652 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -529,18 +529,17 @@ static bool getDeviceLibraries(const ArgList &Args, using SYCLDeviceLibsList = SmallVector; const SYCLDeviceLibsList SYCLDeviceWrapperLibs = { - {"libsycl-crt", "libc"}, - {"libsycl-complex", "libm-fp32"}, - {"libsycl-complex-fp64", "libm-fp64"}, - {"libsycl-cmath", "libm-fp32"}, - {"libsycl-cmath-fp64", "libm-fp64"}, + {"libsycl-crt", "libc"}, + {"libsycl-complex", "libm-fp32"}, + {"libsycl-complex-fp64", "libm-fp64"}, + {"libsycl-cmath", "libm-fp32"}, + {"libsycl-cmath-fp64", "libm-fp64"}, #if defined(_WIN32) - {"libsycl-msvc-math", "libm-fp32"}, + {"libsycl-msvc-math", "libm-fp32"}, #endif - {"libsycl-imf", "libimf-fp32"}, - {"libsycl-imf-fp64", "libimf-fp64"}, - {"libsycl-imf-bf16", "libimf-bf16"} - }; + {"libsycl-imf", "libimf-fp32"}, + {"libsycl-imf-fp64", "libimf-fp64"}, + {"libsycl-imf-bf16", "libimf-bf16"}}; // ITT annotation libraries are linked in separately whenever the device // code instrumentation is enabled. const SYCLDeviceLibsList SYCLDeviceAnnotationLibs = { From 53e465534d00cfdc8fafa6c87080709483b6e9f0 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Fri, 11 Jul 2025 13:41:40 +0100 Subject: [PATCH 05/10] Implement CudaToolchain::getDeviceLibs Signed-off-by: Julian Oppermann --- clang/lib/Driver/ToolChains/Cuda.cpp | 18 +++ clang/lib/Driver/ToolChains/Cuda.h | 4 + .../lib/rtc/DeviceCompilation.cpp | 116 +++++++----------- .../lib/translation/Translation.cpp | 4 +- 4 files changed, 66 insertions(+), 76 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index d9a7689115f8..0e6734da15bb 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -1162,6 +1162,24 @@ void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args, HostTC.AddIAMCUIncludeArgs(Args, CC1Args); } +llvm::SmallVector +CudaToolChain::getDeviceLibs( + const llvm::opt::ArgList &DriverArgs, + const Action::OffloadKind DeviceOffloadingKind) const { + if (!DriverArgs.hasFlag(options::OPT_offloadlib, options::OPT_no_offloadlib, + true)) + return {}; + + StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); + std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); + if (LibDeviceFile.empty()) { + getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; + return {}; + } + + return {BitCodeLibraryInfo{LibDeviceFile}}; +} + SanitizerMask CudaToolChain::getSupportedSanitizers() const { // The CudaToolChain only supports sanitizers in the sense that it allows // sanitizer arguments on the command line if they are supported by the host diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 3fa95b6f3d2e..1b1eb59082c2 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -248,6 +248,10 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain { void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + llvm::SmallVector + getDeviceLibs(const llvm::opt::ArgList &Args, + const Action::OffloadKind DeviceOffloadingKind) const override; + SanitizerMask getSupportedSanitizers() const override; VersionTuple diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index fc0bdeb0d652..1da24dbd4242 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -319,8 +319,10 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace -static void addRTCArgs(DerivedArgList &DAL, const std::string &DPCPPRoot, - BinaryFormat Format) { +static void adjustArgs(const InputArgList &UserArgList, + const std::string &DPCPPRoot, BinaryFormat Format, + SmallVectorImpl &CommandLine) { + DerivedArgList DAL{UserArgList}; const auto &OptTable = getDriverOptTable(); DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_fsycl_device_only)); DAL.AddJoinedArg( @@ -335,40 +337,21 @@ static void addRTCArgs(DerivedArgList &DAL, const std::string &DPCPPRoot, auto [CPU, Features] = Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); (void)Features; - if (Format == BinaryFormat::AMDGCN) { - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), - "amdgcn-amd-amdhsa"); - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), - "amdgcn-amd-amdhsa"); - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); - } else { - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), - "nvptx64-nvidia-cuda"); - DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Xsycl_backend)); - DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_cuda_gpu_arch_EQ), CPU); - } + StringRef OT = Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda" + : "amdgcn-amd-amdhsa"; + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); } -} -static void renderArgs(const DerivedArgList &DAL, - SmallVectorImpl &CommandLine) { ArgStringList ASL; for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); - const auto &UserArgList = DAL.getBaseArgs(); for_each(UserArgList, [&UserArgList, &ASL](Arg *A) { A->render(UserArgList, ASL); }); transform(ASL, std::back_inserter(CommandLine), [](const char *AS) { return std::string{AS}; }); } -static void adjustArgs(const InputArgList &UserArgList, - const std::string &DPCPPRoot, BinaryFormat Format, - SmallVectorImpl &CommandLine) { - DerivedArgList DAL{UserArgList}; - addRTCArgs(DAL, DPCPPRoot, Format); - renderArgs(DAL, CommandLine); -} - static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, InMemoryFile SourceFile, View IncludeFiles, DiagnosticConsumer *Consumer) { @@ -655,9 +638,7 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, SmallVector CommandLine; CommandLine.push_back(Argv0); - DerivedArgList AdjustedArgs{UserArgList}; - addRTCArgs(AdjustedArgs, DPCPPRoot, Format); - renderArgs(AdjustedArgs, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); CommandLine.push_back(CppFileName); SmallVector CommandLineCStr(CommandLine.size()); llvm::transform(CommandLine, CommandLineCStr.begin(), @@ -668,56 +649,43 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, return createStringError("Unable to construct driver for CUDA/HIP"); } - auto [CPU, Features] = - Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format); - (void)Features; - // Helper lambda to link modules. - auto LinkInLib = [&](const StringRef LibDevice) -> Error { - ModuleUPtr LibDeviceModule; - if (auto Error = loadBitcodeLibrary(LibDevice, Context) - .moveInto(LibDeviceModule)) { - return Error; - } - if (Linker::linkModules(Module, std::move(LibDeviceModule), - Linker::LinkOnlyNeeded)) { - return createStringError("Unable to link libdevice: %s", - BuildLog.c_str()); - } - return Error::success(); - }; - SmallVector LibDeviceFiles; + const ToolChain *OffloadTC = + C->getSingleOffloadToolChain(); + InputArgList EmptyArgList; + auto Archs = + D.getOffloadArchs(*C, EmptyArgList, Action::OFK_SYCL, OffloadTC); + assert(Archs.size() == 1 && + "Offload toolchain should be configured to single architecture"); + StringRef CPU = *Archs.begin(); + + // Pass only `-march=` or `-mcpu=` with the GPU arch determined by the + // driver to `getDeviceLibs`. + DerivedArgList CPUArgList{EmptyArgList}; if (Format == BinaryFormat::PTX) { - // For NVPTX we can get away with CudaInstallationDetector. - LazyDetector CudaInstallation{D, T, - UserArgList}; - auto LibDevice = CudaInstallation->getLibDeviceFile(CPU); - if (LibDevice.empty()) { - return createStringError("Unable to find Cuda libdevice"); - } - LibDeviceFiles.push_back(LibDevice); + CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_march_EQ), + CPU); } else { - // AMDGPU requires entire toolchain in order to provide all common bitcode - // libraries. - const ToolChain *OffloadTC = - C->getSingleOffloadToolChain(); - // `AdjustedArgs` already contains `--offload-arch=`, but that - // doesn't seem to be picked up by the logic called by `getDeviceLibs`. - AdjustedArgs.AddJoinedArg( - nullptr, getDriverOptTable().getOption(OPT_mcpu_EQ), CPU); - auto CommonDeviceLibs = - OffloadTC->getDeviceLibs(AdjustedArgs, Action::OffloadKind::OFK_SYCL); - if (CommonDeviceLibs.empty()) { - return createStringError("Unable to find ROCm common device libraries"); - } - for (auto &Lib : CommonDeviceLibs) { - LibDeviceFiles.push_back(Lib.Path); - } + CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_mcpu_EQ), CPU); + } + SmallVector CommonDeviceLibs = + OffloadTC->getDeviceLibs(CPUArgList, Action::OffloadKind::OFK_SYCL); + + if (CommonDeviceLibs.empty()) { + return createStringError("Unable to find common device libraries"); } - for (auto &LibDeviceFile : LibDeviceFiles) { - // llvm::Error converts to false on success. - if (auto Error = LinkInLib(LibDeviceFile)) { + for (auto &Lib : CommonDeviceLibs) { + + ModuleUPtr LibModule; + if (auto Error = + loadBitcodeLibrary(Lib.Path, Context).moveInto(LibModule)) { return Error; } + + if (Linker::linkModules(Module, std::move(LibModule), + Linker::LinkOnlyNeeded)) { + return createStringError("Unable to link device library %s: %s", + Lib.Path.c_str(), BuildLog.c_str()); + } } } diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp index 99eaffd3384d..4acd6c81c712 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp +++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp @@ -192,8 +192,8 @@ std::pair Translator::getTargetCPUAndFeatureAttrs( auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr; auto CPUVal = ConfigHelper::get(); auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); + llvm::StringRef CPU{CPUVal.begin(), CPUVal.size()}; + llvm::StringRef Features{FeaturesVal.begin(), FeaturesVal.size()}; if (CPU.empty()) { // Set to the lowest tested target according to the GetStartedGuide, section // "Build DPC++ toolchain with support for HIP AMD" From 3c2a6b1c1738aa30a4e8188bb635d02ca9c366bd Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Fri, 11 Jul 2025 13:52:41 +0100 Subject: [PATCH 06/10] Drop clang/lib import Signed-off-by: Julian Oppermann --- sycl-jit/jit-compiler/CMakeLists.txt | 1 - sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt index 9e97d2c8ccfd..fabf64546e9d 100644 --- a/sycl-jit/jit-compiler/CMakeLists.txt +++ b/sycl-jit/jit-compiler/CMakeLists.txt @@ -60,7 +60,6 @@ target_include_directories(sycl-jit ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_SPIRV_INCLUDE_DIRS} ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/include - ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/lib ${CMAKE_BINARY_DIR}/tools/clang/include ) target_include_directories(sycl-jit diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 1da24dbd4242..b7a045e5529d 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -11,8 +11,6 @@ #include "JITBinaryInfo.h" #include "translation/Translation.h" -#include -#include #include #include #include @@ -667,9 +665,9 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } else { CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_mcpu_EQ), CPU); } + SmallVector CommonDeviceLibs = OffloadTC->getDeviceLibs(CPUArgList, Action::OffloadKind::OFK_SYCL); - if (CommonDeviceLibs.empty()) { return createStringError("Unable to find common device libraries"); } From e232425eb205bd408145dc9e853c6909fc7ef5dd Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Fri, 11 Jul 2025 16:23:03 +0100 Subject: [PATCH 07/10] Bump counter Signed-off-by: Julian Oppermann --- sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index 7f3757fc7062..6d41cff6dc4c 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 29 +// CHECK-NUM-MATCHES: 30 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see From 37dd687c0f6520124b4f232294f628e8ec44c243 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Fri, 11 Jul 2025 16:29:39 +0100 Subject: [PATCH 08/10] KISS Signed-off-by: Julian Oppermann --- clang/lib/Driver/ToolChains/Cuda.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 0e6734da15bb..b11ca93c7fe3 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -1166,10 +1166,6 @@ llvm::SmallVector CudaToolChain::getDeviceLibs( const llvm::opt::ArgList &DriverArgs, const Action::OffloadKind DeviceOffloadingKind) const { - if (!DriverArgs.hasFlag(options::OPT_offloadlib, options::OPT_no_offloadlib, - true)) - return {}; - StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); if (LibDeviceFile.empty()) { From 66986e260719ec37e34126fae6849a483101a347 Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Mon, 14 Jul 2025 08:24:01 +0100 Subject: [PATCH 09/10] WS Signed-off-by: Julian Oppermann --- sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index b7a045e5529d..902071a1fe2b 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -671,8 +671,8 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, if (CommonDeviceLibs.empty()) { return createStringError("Unable to find common device libraries"); } - for (auto &Lib : CommonDeviceLibs) { + for (auto &Lib : CommonDeviceLibs) { ModuleUPtr LibModule; if (auto Error = loadBitcodeLibrary(Lib.Path, Context).moveInto(LibModule)) { From f00d2a792c38e272e76213697e0d424f885b669e Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 15 Jul 2025 12:29:57 +0100 Subject: [PATCH 10/10] Nit Signed-off-by: Julian Oppermann --- sycl/source/kernel_bundle.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index 59c089a8ac95..c18bdbc46a62 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -409,10 +409,10 @@ bool is_source_kernel_bundle_supported( const std::vector &DeviceImplVec) { backend BE = DeviceImplVec[0]->getBackend(); // Support is limited to the opencl and level_zero backends. - bool BE_Acceptable = - (BE == sycl::backend::ext_oneapi_level_zero) || - (BE == sycl::backend::opencl || BE == sycl::backend::ext_oneapi_hip || - BE == sycl::backend::ext_oneapi_cuda); + bool BE_Acceptable = BE == sycl::backend::ext_oneapi_level_zero || + BE == sycl::backend::opencl || + BE == sycl::backend::ext_oneapi_hip || + BE == sycl::backend::ext_oneapi_cuda; if (!BE_Acceptable) return false;