From de5084d55b4ece0103451d985a3cf3ad87b578f2 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 29 Apr 2025 10:08:01 +0000 Subject: [PATCH 01/26] [SYCL] RTC support for GPU targets This patch extends RTC support to GPU (AMD and Nvidia) targets. Additionally: * reinstate __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION tag, * split sycl.cpp RTC file to exclude IMF from the body of the main test. --- sycl-jit/jit-compiler/CMakeLists.txt | 1 + sycl-jit/jit-compiler/include/RTC.h | 11 +- .../lib/rtc/DeviceCompilation.cpp | 143 ++++++++++++++++-- .../jit-compiler/lib/rtc/DeviceCompilation.h | 8 +- sycl-jit/jit-compiler/lib/rtc/RTC.cpp | 22 +-- .../lib/translation/Translation.cpp | 89 +++++------ .../lib/translation/Translation.h | 4 +- sycl/source/detail/device_image_impl.hpp | 34 ++++- sycl/source/detail/jit_compiler.cpp | 57 +++++-- sycl/source/detail/jit_compiler.hpp | 4 +- .../kernel_compiler/kernel_compiler_sycl.cpp | 5 +- .../kernel_compiler/kernel_compiler_sycl.hpp | 3 +- sycl/source/kernel_bundle.cpp | 6 +- sycl/test-e2e/KernelCompiler/sycl.cpp | 16 +- sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 80 ++++++++++ 15 files changed, 364 insertions(+), 119 deletions(-) create mode 100644 sycl/test-e2e/KernelCompiler/sycl_imf.cpp diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt index fabf64546e9da..9e97d2c8ccfdb 100644 --- a/sycl-jit/jit-compiler/CMakeLists.txt +++ b/sycl-jit/jit-compiler/CMakeLists.txt @@ -60,6 +60,7 @@ target_include_directories(sycl-jit ${LLVM_MAIN_INCLUDE_DIR} ${LLVM_SPIRV_INCLUDE_DIRS} ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/include + ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/lib ${CMAKE_BINARY_DIR}/tools/clang/include ) target_include_directories(sycl-jit diff --git a/sycl-jit/jit-compiler/include/RTC.h b/sycl-jit/jit-compiler/include/RTC.h index a1a39d5b50ad3..6a690df459646 100644 --- a/sycl-jit/jit-compiler/include/RTC.h +++ b/sycl-jit/jit-compiler/include/RTC.h @@ -176,10 +176,11 @@ class RTCResult { /// Calculates a BLAKE3 hash of the pre-processed source string described by /// \p SourceFile (considering any additional \p IncludeFiles) and the -/// concatenation of the \p UserArgs. +/// concatenation of the \p UserArgs, for a given \p Format. JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs); + View UserArgs, + BinaryFormat Format); /// Compiles, links against device libraries, and finalizes the device code in /// the source string described by \p SourceFile, considering any additional \p @@ -191,10 +192,14 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, /// /// If \p SaveIR is true and \p CachedIR is empty, the LLVM module obtained from /// the frontend invocation is wrapped in bitcode format in the result object. +/// +/// \p BinaryFormat describes the desired format of the compilation - which +/// corresponds to the backend that is being targeted. JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR); + View CachedIR, bool SaveIR, + BinaryFormat Format); /// Requests that the JIT binary referenced by \p Address is deleted from the /// `JITContext`. diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 6a39b79034e7b..de94eeff2afe8 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -8,6 +8,8 @@ #include "DeviceCompilation.h" #include "ESIMD.h" +#include "JITBinaryInfo.h" +#include "translation/Translation.h" #include #include @@ -22,6 +24,15 @@ #include #include #include +#if defined(JIT_SUPPORT_PTX) || defined(JIT_SUPPORT_AMDGCN) +#include +#endif +#ifdef JIT_SUPPORT_PTX +#include +#include +#elif JIT_SUPPORT_AMDGCN +#include +#endif #include #include @@ -178,7 +189,8 @@ class RTCToolActionBase : public ToolAction { assert(!hasExecuted() && "Action should only be invoked on a single file"); // Create a compiler instance to handle the actual work. - CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + CompilerInstance Compiler(std::move(Invocation), + std::move(PCHContainerOps)); Compiler.setFileManager(Files); // Suppress summary with number of warnings and errors being printed to // stdout. @@ -361,10 +373,24 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, }); } -Expected -jit_compiler::calculateHash(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList) { +static void setGPUTarget(BinaryFormat Format, + SmallVector &CommandLine) { + auto [CPU, _] = Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + CommandLine.push_back("-fsycl"); + if (Format == BinaryFormat::PTX) { + CommandLine.push_back("-fsycl-targets=nvptx64-nvidia-cuda"); + CommandLine.push_back("-Xsycl-target-backend"); + CommandLine.push_back("--cuda-gpu-arch=" + CPU); + } else if (Format == BinaryFormat::AMDGCN) { + CommandLine.push_back("-fsycl-targets=amdgcn-amd-amdhsa"); + CommandLine.push_back("-Xsycl-target-backend=amdgcn-amd-amdhsa"); + CommandLine.push_back("--offload-arch=" + CPU); + } +} + +Expected jit_compiler::calculateHash( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, BinaryFormat Format) { TimeTraceScope TTS{"calculateHash"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -373,6 +399,9 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, } SmallVector CommandLine; + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + setGPUTarget(Format, CommandLine); + } adjustArgs(UserArgList, DPCPPRoot, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; @@ -399,11 +428,10 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, return createStringError("Calculating source hash failed"); } -Expected -jit_compiler::compileDeviceCode(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList, - std::string &BuildLog, LLVMContext &Context) { +Expected jit_compiler::compileDeviceCode( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, std::string &BuildLog, + LLVMContext &Context, BinaryFormat Format) { TimeTraceScope TTS{"compileDeviceCode"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -412,6 +440,9 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, } SmallVector CommandLine; + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + setGPUTarget(Format, CommandLine); + } adjustArgs(UserArgList, DPCPPRoot, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; @@ -430,12 +461,22 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, return createStringError(BuildLog); } -// This function is a simplified copy of the device library selection process in -// `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V target -// (no AoT, no third-party GPUs, no native CPU). Keep in sync! +// This function is a simplified copy of the device library selection process +// in `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V, or +// GPU targets (no AoT, no native CPU). Keep in sync! static bool getDeviceLibraries(const ArgList &Args, SmallVectorImpl &LibraryList, - DiagnosticsEngine &Diags) { + DiagnosticsEngine &Diags, BinaryFormat Format) { + // For CUDA/HIP we only need devicelib, early exit here. + if (Format == BinaryFormat::PTX) { + LibraryList.push_back( + Args.MakeArgString("devicelib-nvptx64-nvidia-cuda.bc")); + return false; + } else if (Format == BinaryFormat::AMDGCN) { + LibraryList.push_back(Args.MakeArgString("devicelib-amdgcn-amd-amdhsa.bc")); + return false; + } + struct DeviceLibOptInfo { StringRef DeviceLibName; StringRef DeviceLibOption; @@ -540,7 +581,8 @@ static Expected loadBitcodeLibrary(StringRef LibPath, Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, const InputArgList &UserArgList, - std::string &BuildLog) { + std::string &BuildLog, + BinaryFormat Format) { TimeTraceScope TTS{"linkDeviceLibraries"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -555,11 +597,29 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, /* ShouldOwnClient=*/false); SmallVector LibNames; - bool FoundUnknownLib = getDeviceLibraries(UserArgList, LibNames, Diags); + const bool FoundUnknownLib = + getDeviceLibraries(UserArgList, LibNames, Diags, Format); if (FoundUnknownLib) { return createStringError("Could not determine list of device libraries: %s", BuildLog.c_str()); } + const bool IsGPUTarget = + Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN; + if (IsGPUTarget) { + // Based on the OS and the format decide on the version of libspirv. + // NOTE: this will be problematic if cross-compiling between OSes. + std::string Libclc{"clc/"}; + Libclc.append( +#ifdef _WIN32 + "remangled-l32-signed_char.libspirv-" +#else + "remangled-l64-signed_char.libspirv-" +#endif + ); + Libclc.append(Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda.bc" + : "amdgcn-amd-amdhsa.bc"); + LibNames.push_back(Libclc); + } LLVMContext &Context = Module.getContext(); for (const std::string &LibName : LibNames) { @@ -577,6 +637,57 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } } + // For GPU targets we need to link against vendor provided libdevice. + if (IsGPUTarget) { + Triple T{Module.getTargetTriple()}; + Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags}; + auto [CPU, _] = + Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format); + // Helper lambda to link modules. + auto LinkInLib = [&](const StringRef LibDevice) -> Error { + ModuleUPtr LibDeviceModule; + if (auto Error = loadBitcodeLibrary(LibDevice, Context) + .moveInto(LibDeviceModule)) { + return Error; + } + if (Linker::linkModules(Module, std::move(LibDeviceModule), + Linker::LinkOnlyNeeded)) { + return createStringError("Unable to link libdevice: %s", + BuildLog.c_str()); + } + return Error::success(); + }; + SmallVector LibDeviceFiles; +#ifdef JIT_SUPPORT_PTX + // For NVPTX we can get away with CudaInstallationDetector. + LazyDetector CudaInstallation{D, T, UserArgList}; + auto LibDevice = CudaInstallation->getLibDeviceFile(CPU); + if (LibDevice.empty()) { + return createStringError("Unable to find Cuda libdevice"); + } + LibDeviceFiles.push_back(LibDevice); +#elif JIT_SUPPORT_AMDGCN + // AMDGPU requires entire toolchain in order to provide all common bitcode + // libraries. + clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList); + auto CommonDeviceLibs = TC.getCommonDeviceLibNames( + UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false); + if (CommonDeviceLibs.empty()) { + return createStringError("Unable to find ROCm common device libraries"); + } + for (auto &Lib : CommonDeviceLibs) { + LibDeviceFiles.push_back(Lib.Path); + } +#endif + for (auto &LibDeviceFile : LibDeviceFiles) { + auto Res = LinkInLib(LibDeviceFile); + // llvm::Error converts to false on success. + if (Res) { + return Res; + } + } + } + return Error::success(); } diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h index 62da2cdb54bf1..aa1b19df0cc8f 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h @@ -8,6 +8,7 @@ #pragma once +#include "JITBinaryInfo.h" #include "RTC.h" #include @@ -24,16 +25,17 @@ using ModuleUPtr = std::unique_ptr; llvm::Expected calculateHash(InMemoryFile SourceFile, View IncludeFiles, - const llvm::opt::InputArgList &UserArgList); + const llvm::opt::InputArgList &UserArgList, BinaryFormat Format); llvm::Expected compileDeviceCode(InMemoryFile SourceFile, View IncludeFiles, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog, llvm::LLVMContext &Context); + std::string &BuildLog, llvm::LLVMContext &Context, + BinaryFormat Format); llvm::Error linkDeviceLibraries(llvm::Module &Module, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog); + std::string &BuildLog, BinaryFormat Format); using PostLinkResult = std::pair>; llvm::Expected diff --git a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp index e9307ccb27eed..2e42bca3a49a5 100644 --- a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "RTC.h" +#include "JITBinaryInfo.h" #include "helper/ErrorHelper.h" #include "rtc/DeviceCompilation.h" #include "translation/SPIRVLLVMTranslation.h" @@ -26,7 +27,8 @@ using namespace jit_compiler; JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs) { + View UserArgs, + BinaryFormat Format) { llvm::opt::InputArgList UserArgList; if (auto Error = parseUserArgs(UserArgs).moveInto(UserArgList)) { return errorTo(std::move(Error), @@ -36,8 +38,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, auto Start = std::chrono::high_resolution_clock::now(); std::string Hash; - if (auto Error = - calculateHash(SourceFile, IncludeFiles, UserArgList).moveInto(Hash)) { + if (auto Error = calculateHash(SourceFile, IncludeFiles, UserArgList, Format) + .moveInto(Hash)) { return errorTo(std::move(Error), "Hashing failed", /*IsHash=*/false); } @@ -55,7 +57,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR) { + View CachedIR, bool SaveIR, + BinaryFormat Format) { llvm::LLVMContext Context; std::string BuildLog; configureDiagnostics(Context, BuildLog); @@ -104,7 +107,7 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, bool FromSource = !Module; if (FromSource) { if (auto Error = compileDeviceCode(SourceFile, IncludeFiles, UserArgList, - BuildLog, Context) + BuildLog, Context, Format) .moveInto(Module)) { return errorTo(std::move(Error), "Device compilation failed"); } @@ -118,7 +121,8 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, IR = RTCDeviceCodeIR{BCString.data(), BCString.data() + BCString.size()}; } - if (auto Error = linkDeviceLibraries(*Module, UserArgList, BuildLog)) { + if (auto Error = + linkDeviceLibraries(*Module, UserArgList, BuildLog, Format)) { return errorTo(std::move(Error), "Device linking failed"); } @@ -131,9 +135,9 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, for (auto [DevImgInfo, Module] : llvm::zip_equal(BundleInfo.DevImgInfos, Modules)) { - if (auto Error = Translator::translate(*Module, JITContext::getInstance(), - BinaryFormat::SPIRV) - .moveInto(DevImgInfo.BinaryInfo)) { + if (auto Error = + Translator::translate(*Module, JITContext::getInstance(), Format) + .moveInto(DevImgInfo.BinaryInfo)) { return errorTo(std::move(Error), "SPIR-V translation failed"); } } diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp index 457b6bfc0e637..99eaffd3384d1 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp +++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp @@ -83,9 +83,6 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, LLVMInitializeNVPTXAsmPrinter(); LLVMInitializeNVPTXTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"nvptx64-nvidia-cuda"}; std::string ErrorMessage; @@ -99,32 +96,11 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, ErrorMessage.c_str()); } - // Give priority to user specified values (through environment variables: - // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - // If they were not set, use default and consult the module for alternatives - // (if present). - if (CPU.empty()) { - CPU = "sm_50"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - Features = "+sm_50,+ptx76"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::PTX); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); llvm::legacy::PassManager PM; @@ -166,9 +142,6 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, LLVMInitializeAMDGPUAsmPrinter(); LLVMInitializeAMDGPUTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"amdgcn-amd-amdhsa"}; std::string ErrorMessage; @@ -181,29 +154,10 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, "Failed to load and translate AMDGCN LLVM IR module with error %s", ErrorMessage.c_str()); - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - if (CPU.empty()) { - // Set to the lowest tested target according to the GetStartedGuide, section - // "Build DPC++ toolchain with support for HIP AMD" - CPU = "gfx906"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } - + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::AMDGCN); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); std::string AMDObj; @@ -226,3 +180,34 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, return &JITCtx.emplaceBinary(std::move(AMDObj), BinaryFormat::AMDGCN); #endif // JIT_SUPPORT_AMDGCN } + +std::pair Translator::getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format) { + assert((Format == BinaryFormat::AMDGCN || Format == BinaryFormat::PTX) && + "Unexpected format found"); + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; + // Give priority to user specified values (through environment variables: + // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). + auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr; + auto CPUVal = ConfigHelper::get(); + auto FeaturesVal = ConfigHelper::get(); + llvm::StringRef CPU = CPUVal.begin(); + llvm::StringRef Features = FeaturesVal.begin(); + if (CPU.empty()) { + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + } + if (Features.empty()) { + Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : ""; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + return std::make_pair(std::string{CPU}, std::string{Features}); +} diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.h b/sycl-jit/jit-compiler/lib/translation/Translation.h index 9c8dfdda5f9f5..78bf4ef8ac416 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.h +++ b/sycl-jit/jit-compiler/lib/translation/Translation.h @@ -12,7 +12,6 @@ #include "JITContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" -#include namespace jit_compiler { @@ -26,6 +25,9 @@ class Translator { translate(llvm::Module &Mod, JITContext &JITCtx, BinaryFormat Format, const char *KernelName = nullptr); + std::pair static getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format); + private: /// Pair of address and size to represent a binary blob. using BinaryBlob = std::pair; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 83af5b246683a..7c9ee1787669e 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -8,6 +8,7 @@ #pragma once +#include "JITBinaryInfo.h" #include #include #include @@ -693,6 +694,22 @@ class device_image_impl { return MRTCBinInfo && MRTCBinInfo->MLanguage == Lang; } + static ::jit_compiler::BinaryFormat getTargetFormat(const backend Backend) { + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::opencl: + return ::jit_compiler::BinaryFormat::SPIRV; + case backend::ext_oneapi_cuda: + return ::jit_compiler::BinaryFormat::PTX; + case backend::ext_oneapi_hip: + return ::jit_compiler::BinaryFormat::AMDGCN; + default: + throw sycl::exception( + sycl::make_error_code(sycl::errc::feature_not_supported), + "Backend unsupported by kernel fusion"); + } + } + std::vector> buildFromSource( const std::vector &Devices, const std::vector &BuildOptions, @@ -724,9 +741,12 @@ class device_image_impl { } } - if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) + if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) { + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::executable, BuildOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); + } std::vector DeviceVec; DeviceVec.reserve(Devices.size()); @@ -815,8 +835,10 @@ class device_image_impl { "device does not support source language"); } } + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::object, CompileOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); } private: @@ -989,8 +1011,8 @@ class device_image_impl { const std::vector &Options, std::string *LogPtr, const std::vector &RegisteredKernelNames, - std::vector> &OutDeviceBins) - const { + std::vector> &OutDeviceBins, + ::jit_compiler::BinaryFormat Format) const { assert(MRTCBinInfo); assert(MRTCBinInfo->MLanguage == syclex::source_language::sycl); assert(std::holds_alternative(MBinImage)); @@ -1020,7 +1042,7 @@ class device_image_impl { auto [Binaries, Prefix] = syclex::detail::SYCL_JIT_Compile( RegisteredKernelNames.empty() ? SourceStr : SourceExt.str(), - MRTCBinInfo->MIncludePairs, Options, LogPtr); + MRTCBinInfo->MIncludePairs, Options, LogPtr, Format); auto &PM = detail::ProgramManager::getInstance(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 189bd3d145309..cf73828d5785b 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -263,6 +263,7 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( Binary.addOffloadEntry(std::move(Entry)); } + bool FinalizationTagAdded = false; for (const auto &FPS : DevImgInfo.Properties) { bool IsDeviceGlobalsPropSet = FPS.Name == __SYCL_PROPERTY_SET_SYCL_DEVICE_GLOBALS; @@ -279,18 +280,50 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( sycl_property_type::SYCL_PROPERTY_TYPE_BYTE_ARRAY}); } } + if (FPS.Name == __SYCL_PROPERTY_SET_PROGRAM_METADATA && + DevImgInfo.BinaryInfo.Format == + ::jit_compiler::BinaryFormat::AMDGCN) { + PropSet.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + FinalizationTagAdded = true; + } Binary.addProperty(std::move(PropSet)); Binary.setCompileOptions(BundleInfo.CompileOptions.c_str()); } - Collection->addDeviceBinary(std::move(Binary), - DevImgInfo.BinaryInfo.BinaryStart, - DevImgInfo.BinaryInfo.BinarySize, - (DevImgInfo.BinaryInfo.AddressBits == 64) - ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 - : __SYCL_DEVICE_BINARY_TARGET_SPIRV32, - SYCL_DEVICE_BINARY_TYPE_SPIRV); + auto BinaryTarget = ""; + auto Format = SYCL_DEVICE_BINARY_TYPE_NONE; + switch (DevImgInfo.BinaryInfo.Format) { + case ::jit_compiler::BinaryFormat::SPIRV: + BinaryTarget = DevImgInfo.BinaryInfo.AddressBits == 64 + ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 + : __SYCL_DEVICE_BINARY_TARGET_SPIRV32; + Format = SYCL_DEVICE_BINARY_TYPE_SPIRV; + break; + case ::jit_compiler::BinaryFormat::PTX: + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_NVPTX64; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + break; + case ::jit_compiler::BinaryFormat::AMDGCN: { + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_AMDGCN; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + // If the program had no properties, the tag needs to be added now. + if (!FinalizationTagAdded) { + PropertySetContainer ProgramMetadata{ + __SYCL_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + Binary.addProperty(std::move(ProgramMetadata)); + } + break; + } + default: + assert(false && "Unsupported format"); + }; + Collection->addDeviceBinary( + std::move(Binary), DevImgInfo.BinaryInfo.BinaryStart, + DevImgInfo.BinaryInfo.BinarySize, BinaryTarget, Format); } sycl_device_binaries Binaries = Collection->getPIDeviceStruct(); @@ -311,7 +344,8 @@ void jit_compiler::destroyDeviceBinaries(sycl_device_binaries Binaries) { std::pair jit_compiler::compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr) { + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format) { auto appendToLog = [LogPtr](const char *Msg) { if (LogPtr) { LogPtr->append(Msg); @@ -339,7 +373,7 @@ std::pair jit_compiler::compileSYCL( std::vector CachedIR; if (PersistentDeviceCodeCache::isEnabled()) { auto Result = - CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView); + CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView, Format); if (Result.failed()) { appendToLog(Result.getPreprocLog()); @@ -349,8 +383,9 @@ std::pair jit_compiler::compileSYCL( } } - auto Result = CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, - CachedIR, /*SaveIR=*/!CacheKey.empty()); + auto Result = + CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, CachedIR, + /*SaveIR=*/!CacheKey.empty(), Format); const char *BuildLog = Result.getBuildLog(); appendToLog(BuildLog); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 0f0a3f3f5738c..e0aefd1deae5d 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -45,7 +46,8 @@ class jit_compiler { std::pair compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr); + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format); void destroyDeviceBinaries(sycl_device_binaries Binaries); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp index 93ec94a8ac328..418842fdc48ec 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp @@ -44,7 +44,8 @@ std::pair SYCL_JIT_Compile( [[maybe_unused]] const std::string &SYCLSource, [[maybe_unused]] const include_pairs_t &IncludePairs, [[maybe_unused]] const std::vector &UserArgs, - [[maybe_unused]] std::string *LogPtr) { + [[maybe_unused]] std::string *LogPtr, + [[maybe_unused]] ::jit_compiler::BinaryFormat Format) { #if SYCL_EXT_JIT_ENABLE static std::atomic_uintptr_t CompilationCounter; std::string CompilationID = "rtc_" + std::to_string(CompilationCounter++); @@ -52,7 +53,7 @@ std::pair SYCL_JIT_Compile( for (const sycl::detail::string_view UserArg : UserArgs) UserArgStrings.push_back(UserArg.data()); return sycl::detail::jit_compiler::get_instance().compileSYCL( - CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr); + CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr, Format); #else throw sycl::exception(sycl::errc::build, "kernel_compiler via sycl-jit is not available"); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp index 047a10a061e1f..007dfae9aff22 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp @@ -8,6 +8,7 @@ #pragma once +#include #include #include // __SYCL_EXPORT #include @@ -35,7 +36,7 @@ userArgsAsString(const std::vector &UserArguments); std::pair SYCL_JIT_Compile(const std::string &Source, const include_pairs_t &IncludePairs, const std::vector &UserArgs, - std::string *LogPtr); + std::string *LogPtr, ::jit_compiler::BinaryFormat Format); void SYCL_JIT_Destroy(sycl_device_binaries Binaries); diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index ab12ff67e8590..c4c45bc04b517 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -406,8 +406,10 @@ bool is_source_kernel_bundle_supported( const std::vector &DeviceImplVec) { backend BE = DeviceImplVec[0]->getBackend(); // Support is limited to the opencl and level_zero backends. - bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) || - (BE == sycl::backend::opencl); + bool BE_Acceptable = + (BE == sycl::backend::ext_oneapi_level_zero) || + (BE == sycl::backend::opencl || BE == sycl::backend::ext_oneapi_hip || + BE == sycl::backend::ext_oneapi_cuda); if (!BE_Acceptable) return false; diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp index fac041254778a..b8fcd01666f2b 100644 --- a/sycl/test-e2e/KernelCompiler/sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -111,15 +110,14 @@ auto constexpr DeviceLibrariesSource = R"===( #include #include #include -#include extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) void device_libs_kernel(float *ptr) { // Extension list: llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp - // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing it. - // Only test the fp32 variants of complex, math and imf to keep this test + // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing + // it. Only test the fp32 variants of complex and math to keep this test // device-agnostic. // cl_intel_devicelib_math @@ -131,14 +129,8 @@ void device_libs_kernel(float *ptr) { // cl_intel_devicelib_cstring ptr[2] = memcmp(ptr + 2, ptr + 2, sizeof(float)); - // cl_intel_devicelib_imf - ptr[3] = sycl::ext::intel::math::sqrt(ptr[3] * 2); - - // cl_intel_devicelib_imf_bf16 - ptr[4] = sycl::ext::intel::math::float2bfloat16(ptr[4] * 0.5f); - // cl_intel_devicelib_bfloat16 - ptr[5] = sycl::ext::oneapi::bfloat16{ptr[5] / 0.25f}; + ptr[3] = sycl::ext::oneapi::bfloat16{ptr[3] / 0.25f}; } )==="; @@ -359,7 +351,7 @@ int test_device_libraries(sycl::queue q) { exe_kb kbExe = syclex::build(kbSrc); sycl::kernel k = kbExe.ext_oneapi_get_kernel("device_libs_kernel"); - constexpr size_t nElem = 6; + constexpr size_t nElem = 4; float *ptr = sycl::malloc_shared(nElem, q); for (int i = 0; i < nElem; ++i) ptr[i] = 1.0f; diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp new file mode 100644 index 0000000000000..ce78ccff1d5d0 --- /dev/null +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -0,0 +1,80 @@ +//==--- sycl.cpp --- kernel_compiler extension imf tests -------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aspect-usm_device_allocations +// REQUIRES: (opencl || level_zero) + +// UNSUPPORTED: accelerator +// UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. + +// RUN: %{build} -o %t.out +// RUN: %{l0_leak_check} %{run} %t.out + +#include +#include +#include + +auto constexpr IMFSources = R"===( +#include +#include +#include + +extern "C" SYCL_EXTERNAL +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) +void imf_kernel(float *ptr) { + // cl_intel_devicelib_imf + //ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2); + + // cl_intel_devicelib_imf_bf16 + //ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f); +} +)==="; + +int main() { +#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::context ctx = q.get_context(); + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, IMFSources); + exe_kb kbExe = syclex::build(kbSrc); + + sycl::kernel k = kbExe.ext_oneapi_get_kernel("imf_kernel"); + constexpr size_t nElem = 2; + float *ptr = sycl::malloc_shared(nElem, q); + for (int i = 0; i < nElem; ++i) + ptr[i] = 1.0f; + + q.submit([&](sycl::handler &cgh) { + cgh.set_arg(0, ptr); + cgh.single_task(k); + }); + q.wait_and_throw(); + + // Check that the kernel was executed. Given the {1.0, ..., 1.0} input, + // the expected result is approximately {1.41, 0.5}. + for (unsigned i = 0; i < nElem; ++i) { + std::cout << ptr[i] << ' '; + assert(ptr[i] != 1.0f); + } + std::cout << std::endl; + + sycl::free(ptr, q); + + bool ok = q.get_device().ext_oneapi_can_build(syclex::source_language::sycl); + if (!ok) { + return -1; + } +#else + static_assert(false, "Kernel Compiler feature test macro undefined"); +#endif + return 0; +} From 50d8feac064303acc028a7df21b1494b301e3968 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 12 Jun 2025 07:26:43 +0000 Subject: [PATCH 02/26] PR feedback --- sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 12 +++++------- sycl/source/detail/device_image_impl.hpp | 4 ++-- sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index de94eeff2afe8..5d21986e3945a 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -376,7 +376,6 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, static void setGPUTarget(BinaryFormat Format, SmallVector &CommandLine) { auto [CPU, _] = Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); - CommandLine.push_back("-fsycl"); if (Format == BinaryFormat::PTX) { CommandLine.push_back("-fsycl-targets=nvptx64-nvidia-cuda"); CommandLine.push_back("-Xsycl-target-backend"); @@ -603,9 +602,9 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, return createStringError("Could not determine list of device libraries: %s", BuildLog.c_str()); } - const bool IsGPUTarget = + const bool IsCudaHIP = Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN; - if (IsGPUTarget) { + if (IsCudaHIP) { // Based on the OS and the format decide on the version of libspirv. // NOTE: this will be problematic if cross-compiling between OSes. std::string Libclc{"clc/"}; @@ -638,7 +637,7 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } // For GPU targets we need to link against vendor provided libdevice. - if (IsGPUTarget) { + if (IsCudaHIP) { Triple T{Module.getTargetTriple()}; Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags}; auto [CPU, _] = @@ -680,10 +679,9 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } #endif for (auto &LibDeviceFile : LibDeviceFiles) { - auto Res = LinkInLib(LibDeviceFile); // llvm::Error converts to false on success. - if (Res) { - return Res; + if (auto Error = LinkInLib(LibDeviceFile)) { + return Error; } } } diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index 7c9ee1787669e..4217ecc24288a 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -705,8 +705,8 @@ class device_image_impl { return ::jit_compiler::BinaryFormat::AMDGCN; default: throw sycl::exception( - sycl::make_error_code(sycl::errc::feature_not_supported), - "Backend unsupported by kernel fusion"); + sycl::make_error_code(sycl::errc::invalid), + "Backend does not support kernel_compiler extension"); } } diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp index ce78ccff1d5d0..5393b0bbbda86 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -1,4 +1,4 @@ -//==--- sycl.cpp --- kernel_compiler extension imf tests -------------------==// +//==--- sycl_imf.cpp --- kernel_compiler extension imf tests ---------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -37,6 +37,7 @@ void imf_kernel(float *ptr) { int main() { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER + sycl::queue q; namespace syclex = sycl::ext::oneapi::experimental; using source_kb = sycl::kernel_bundle; using exe_kb = sycl::kernel_bundle; From e7f6349c4a6f7bf714362f9c9327797af95c2fbc Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Thu, 12 Jun 2025 04:10:46 -0400 Subject: [PATCH 03/26] Bump sycl.hpp count - note, this include comes from RTC source --- sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index c913f0d6bcaa6..c0b233756346d 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 20 +// CHECK-NUM-MATCHES: 21 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see From f2f861d4dd69addde88e5f859afba723510c9151 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Sat, 14 Jun 2025 04:01:58 -0400 Subject: [PATCH 04/26] PR feedback 2 * sycl_imf fix - commented kernel * move away from ifdefs * don't use CLI strings for args in setting up the GPU targets --- .../lib/rtc/DeviceCompilation.cpp | 96 +++++++++---------- sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 6 +- 2 files changed, 47 insertions(+), 55 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 5d21986e3945a..2efe85e727931 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -11,10 +11,14 @@ #include "JITBinaryInfo.h" #include "translation/Translation.h" +#include +#include +#include #include #include #include #include +#include #include #include #include @@ -24,15 +28,6 @@ #include #include #include -#if defined(JIT_SUPPORT_PTX) || defined(JIT_SUPPORT_AMDGCN) -#include -#endif -#ifdef JIT_SUPPORT_PTX -#include -#include -#elif JIT_SUPPORT_AMDGCN -#include -#endif #include #include @@ -324,7 +319,7 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace static void adjustArgs(const InputArgList &UserArgList, - const std::string &DPCPPRoot, + const std::string &DPCPPRoot, BinaryFormat Format, SmallVectorImpl &CommandLine) { DerivedArgList DAL{UserArgList}; const auto &OptTable = getDriverOptTable(); @@ -337,6 +332,22 @@ static void adjustArgs(const InputArgList &UserArgList, // unused argument warning. DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + auto [CPU, _] = + Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + if (Format == BinaryFormat::AMDGCN) { + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), + "amdgcn-amd-amdhsa"); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), + "amdgcn-amd-amdhsa"); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); + } else { + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), + "nvptx64-nvidia-cuda"); + DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Xsycl_backend)); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_cuda_gpu_arch_EQ), CPU); + } + } ArgStringList ASL; for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); for_each(UserArgList, @@ -373,20 +384,6 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, }); } -static void setGPUTarget(BinaryFormat Format, - SmallVector &CommandLine) { - auto [CPU, _] = Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); - if (Format == BinaryFormat::PTX) { - CommandLine.push_back("-fsycl-targets=nvptx64-nvidia-cuda"); - CommandLine.push_back("-Xsycl-target-backend"); - CommandLine.push_back("--cuda-gpu-arch=" + CPU); - } else if (Format == BinaryFormat::AMDGCN) { - CommandLine.push_back("-fsycl-targets=amdgcn-amd-amdhsa"); - CommandLine.push_back("-Xsycl-target-backend=amdgcn-amd-amdhsa"); - CommandLine.push_back("--offload-arch=" + CPU); - } -} - Expected jit_compiler::calculateHash( InMemoryFile SourceFile, View IncludeFiles, const InputArgList &UserArgList, BinaryFormat Format) { @@ -398,10 +395,7 @@ Expected jit_compiler::calculateHash( } SmallVector CommandLine; - if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { - setGPUTarget(Format, CommandLine); - } - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -439,10 +433,7 @@ Expected jit_compiler::compileDeviceCode( } SmallVector CommandLine; - if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { - setGPUTarget(Format, CommandLine); - } - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -657,27 +648,28 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, return Error::success(); }; SmallVector LibDeviceFiles; -#ifdef JIT_SUPPORT_PTX - // For NVPTX we can get away with CudaInstallationDetector. - LazyDetector CudaInstallation{D, T, UserArgList}; - auto LibDevice = CudaInstallation->getLibDeviceFile(CPU); - if (LibDevice.empty()) { - return createStringError("Unable to find Cuda libdevice"); - } - LibDeviceFiles.push_back(LibDevice); -#elif JIT_SUPPORT_AMDGCN - // AMDGPU requires entire toolchain in order to provide all common bitcode - // libraries. - clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList); - auto CommonDeviceLibs = TC.getCommonDeviceLibNames( - UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false); - if (CommonDeviceLibs.empty()) { - return createStringError("Unable to find ROCm common device libraries"); - } - for (auto &Lib : CommonDeviceLibs) { - LibDeviceFiles.push_back(Lib.Path); + if (Format == BinaryFormat::PTX) { + // For NVPTX we can get away with CudaInstallationDetector. + LazyDetector CudaInstallation{D, T, + UserArgList}; + auto LibDevice = CudaInstallation->getLibDeviceFile(CPU); + if (LibDevice.empty()) { + return createStringError("Unable to find Cuda libdevice"); + } + LibDeviceFiles.push_back(LibDevice); + } else { + // AMDGPU requires entire toolchain in order to provide all common bitcode + // libraries. + clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList); + auto CommonDeviceLibs = TC.getCommonDeviceLibNames( + UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false); + if (CommonDeviceLibs.empty()) { + return createStringError("Unable to find ROCm common device libraries"); + } + for (auto &Lib : CommonDeviceLibs) { + LibDeviceFiles.push_back(Lib.Path); + } } -#endif for (auto &LibDeviceFile : LibDeviceFiles) { // llvm::Error converts to false on success. if (auto Error = LinkInLib(LibDeviceFile)) { diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp index 5393b0bbbda86..55f4b6b788e10 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -28,10 +28,10 @@ extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) void imf_kernel(float *ptr) { // cl_intel_devicelib_imf - //ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2); + ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2); // cl_intel_devicelib_imf_bf16 - //ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f); + ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f); } )==="; @@ -60,7 +60,7 @@ int main() { }); q.wait_and_throw(); - // Check that the kernel was executed. Given the {1.0, ..., 1.0} input, + // Check that the kernel was executed. Given the {1.0, 1.0} input, // the expected result is approximately {1.41, 0.5}. for (unsigned i = 0; i < nElem; ++i) { std::cout << ptr[i] << ' '; From d750eaf44ac80a21e0f6d0c8cc0ff47cd12e96ee Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 16 Jun 2025 08:24:53 -0400 Subject: [PATCH 05/26] Fix handling of CPU/Features --- sycl/source/detail/jit_compiler.cpp | 18 ++++++++++++++++++ sycl/test-e2e/KernelCompiler/sycl.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 5 ----- sycl/test-e2e/format.py | 1 + 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index cf73828d5785b..2884e25399e5f 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -346,6 +346,24 @@ std::pair jit_compiler::compileSYCL( const std::vector> &IncludePairs, const std::vector &UserArgs, std::string *LogPtr, ::jit_compiler::BinaryFormat Format) { + if (Format == ::jit_compiler::BinaryFormat::PTX || + Format == ::jit_compiler::BinaryFormat::AMDGCN) { + // If present, set-up the config with env variables describing CPU and + // features. + auto SetUpOption = [](const std::string &Value) { + ::jit_compiler::JITEnvVar Option(Value.begin(), Value.end()); + return Option; + }; + ::jit_compiler::JITEnvVar TargetCPUOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetCPU::set(TargetCPUOpt)); + ::jit_compiler::JITEnvVar TargetFeaturesOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetFeatures::set(TargetFeaturesOpt)); + } + auto appendToLog = [LogPtr](const char *Msg) { if (LogPtr) { LogPtr->append(Msg); diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp index b8fcd01666f2b..ccd5941d0ee4b 100644 --- a/sycl/test-e2e/KernelCompiler/sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch}; %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp index 55f4b6b788e10..363728b318580 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -69,11 +69,6 @@ int main() { std::cout << std::endl; sycl::free(ptr, q); - - bool ok = q.get_device().ext_oneapi_can_build(syclex::source_language::sycl); - if (!ok) { - return -1; - } #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 44da0ef1c52c4..ff2f8e36ecfb7 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -244,6 +244,7 @@ def execute(self, test, litConfig): ) sycl_target_opts += hip_arch_opts substitutions.append(("%{hip_arch_opts}", hip_arch_opts)) + substitutions.append(("%{amd_arch}", test.config.amd_arch)) if ( "target-spir" in build_targets and "spirv-backend" in test.config.available_features From 21ca5528e56f965e5b97b09386d6148f617a2822 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 17 Jun 2025 06:00:51 -0400 Subject: [PATCH 06/26] No need for semicolon in env --- sycl/test-e2e/KernelCompiler/sycl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp index ccd5941d0ee4b..10d4fe69b839d 100644 --- a/sycl/test-e2e/KernelCompiler/sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch}; %} %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 From 9a0d655c7c9940ff43854394df101641c33c1d52 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 18 Jun 2025 12:21:11 +0000 Subject: [PATCH 07/26] Enable more RTC tests --- sycl/test-e2e/KernelCompiler/sycl_basic.cpp | 1 - sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 10 +++++++--- sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp | 5 ++--- sycl/test-e2e/KernelCompiler/sycl_context_error.cpp | 2 -- sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp | 1 - .../KernelCompiler/sycl_export_registration.cpp | 2 ++ sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp | 2 -- sycl/test-e2e/KernelCompiler/sycl_join.cpp | 1 - sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp | 6 +++--- sycl/test-e2e/KernelCompiler/sycl_link.cpp | 2 ++ sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp | 2 ++ .../sycl_link_common_dep_optional_feature.cpp | 2 ++ .../KernelCompiler/sycl_link_export_conflict.cpp | 2 ++ .../KernelCompiler/sycl_link_kernel_conflict.cpp | 2 ++ sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp | 1 - sycl/test-e2e/KernelCompiler/sycl_overload.cpp | 1 - sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp | 1 - 18 files changed, 25 insertions(+), 20 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp index 5c212ee0d843d..750cd5bc5f3ac 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 1ec2f744b632d..09225158f692f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -17,8 +16,13 @@ // RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir // RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM -// RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT + +// Eviction mechanism is based on the size of compiled kernels, which in turns +// depends on the target. Don't run eviction check for CUDA/HIP, so that we +// don't have to find a magic number that works for all binaries (and by +// definition is flaky). +// RUN: %{run} %if !(hip || cuda) %{ %{run-aux} rm -rf %t/cache_dir %} +// RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp index c449c25647414..48f111cbaad7a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -25,11 +24,11 @@ // CHECK-WRITTEN-TO-CACHE: [Persistent Cache]: enabled // CHECK-WRITTEN-TO-CACHE-NOT: [Persistent Cache]: using cached device binary -// CHECK-WRITTEN-TO-CACHE: [Persistent Cache]: device binary has been cached +// CHECK-WRITTEN-TO-CACHE: {{\[Persistent Cache\]: device binary has been cached|\[kernel_compiler Persistent Cache\]: storing device code IR}} // CHECK-READ-FROM-CACHE: [Persistent Cache]: enabled // CHECK-READ-FROM-CACHE-NOT: [Persistent Cache]: device binary has been cached -// CHECK-READ-FROM-CACHE: [Persistent Cache]: using cached device binary +// CHECK-READ-FROM-CACHE: {{\[kernel_compiler Persistent Cache\]: using cached device code IR|\[Persistent Cache\]: using cached device binary}} #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp index d111e77960e48..5392be5636b8d 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp @@ -6,8 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) - // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp index 93aa4eb9712c6..7bc78ceabefd8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: level_zero +// REQUIRES: (level_zero || cuda || amd) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: windows diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index ceda7252369e5..40f092acc9605 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator, opencl && gpu diff --git a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp index 789bab6223546..e8390ced06efb 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp @@ -11,6 +11,8 @@ // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle is built that exports a symbol and // -- other kernel bundles that uses it are compiled/linked without it. These // -- cases should fail due to unresolved symbols, rather than picking up the diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp index bf0d46b5800be..62b42c8a6dd26 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp @@ -6,9 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations - // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp index 4ec0ec5c35604..26a1dba1a61d2 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations // UNSUPPORTED: accelerator diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp index 4ef1cb2d88e9c..b3d7ef1bd6c1a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -51,14 +50,15 @@ int test_lifetimes() { ctx, syclex::source_language::sycl, SYCLSource); exe_kb kbExe1 = syclex::build(kbSrc); - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG1:.*]])) + // Cuda/Hip programs will be cratew with Binary, spirv IL. + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]])) { std::cout << "Scope1\n"; // CHECK: Scope1 exe_kb kbExe2 = syclex::build(kbSrc); // kbExe2 goes out of scope; its kernels are removed from program mananager. - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG2:.*]])) + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG2:.*]])) // CHECK: urProgramRelease{{.*}}[[PROG2]] } std::cout << "End Scope1\n"; diff --git a/sycl/test-e2e/KernelCompiler/sycl_link.cpp b/sycl/test-e2e/KernelCompiler/sycl_link.cpp index 9ff9878e387ee..17aedb9685360 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link.cpp @@ -15,6 +15,8 @@ // -- Test for a simple linking case with source files compiled from SYCL source // -- at runtime. +// Note linking is not supported on CUDA/HIP. + // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %{l0_leak_check} %{run} %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp index 49d67205b8cd1..ebe9e1d8ab56e 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp @@ -12,6 +12,8 @@ // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols. // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp index 76fe85ce72fa9..dbc31e63da125 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp @@ -12,6 +12,8 @@ // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols, but one // -- may not be supported on the device. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp index f9d6381e08b38..465ce7b5136a4 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp @@ -12,6 +12,8 @@ // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle with an exported symbol is compiled // -- before another kernel bundle using a different variant of the symbol. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp index 7fe9b0fd9db79..5b2a0e41b33b3 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp @@ -12,6 +12,8 @@ // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. +// Note linking is not supported on CUDA/HIP. + // -- Test for the linking of two kernels with conflicting definitions of // -- kernels with the same name. diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp index fbe2a75daf617..6c32f9ac95afb 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp index 97488d880993b..31553e4dc19c7 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp index 77a2d89bed123..c156f4c05b31a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // UNSUPPORTED: accelerator // UNSUPPORTED-INTENDED: SYCL-RTC is not available for accelerator devices From 702fbc1b6c4bac768cbd892913bda357f30459c4 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 18 Jun 2025 12:56:27 +0000 Subject: [PATCH 08/26] typo and device flags --- sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp index 7bc78ceabefd8..93aa4eb9712c6 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_flags.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (level_zero || cuda || amd) +// REQUIRES: level_zero // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: windows diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp index b3d7ef1bd6c1a..4fe75db63356b 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp @@ -50,7 +50,7 @@ int test_lifetimes() { ctx, syclex::source_language::sycl, SYCLSource); exe_kb kbExe1 = syclex::build(kbSrc); - // Cuda/Hip programs will be cratew with Binary, spirv IL. + // Cuda/Hip programs will be created with Binary, spirv IL. // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]])) { From 92bb53db4cf5fe1226d84b1dfe607d5614643f4f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 18 Jun 2025 13:05:21 +0000 Subject: [PATCH 09/26] AMD arch substitution --- sycl/test-e2e/KernelCompiler/sycl_basic.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp | 5 +++-- sycl/test-e2e/KernelCompiler/sycl_context_error.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_join.cpp | 4 ++-- sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_overload.cpp | 2 +- sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp | 2 +- 11 files changed, 14 insertions(+), 13 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp index 750cd5bc5f3ac..033070acca675 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 09225158f692f..04c4b910b57fd 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -15,7 +15,7 @@ // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 // RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp index 48f111cbaad7a..c449c25647414 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache_pm.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -24,11 +25,11 @@ // CHECK-WRITTEN-TO-CACHE: [Persistent Cache]: enabled // CHECK-WRITTEN-TO-CACHE-NOT: [Persistent Cache]: using cached device binary -// CHECK-WRITTEN-TO-CACHE: {{\[Persistent Cache\]: device binary has been cached|\[kernel_compiler Persistent Cache\]: storing device code IR}} +// CHECK-WRITTEN-TO-CACHE: [Persistent Cache]: device binary has been cached // CHECK-READ-FROM-CACHE: [Persistent Cache]: enabled // CHECK-READ-FROM-CACHE-NOT: [Persistent Cache]: device binary has been cached -// CHECK-READ-FROM-CACHE: {{\[kernel_compiler Persistent Cache\]: using cached device code IR|\[Persistent Cache\]: using cached device binary}} +// CHECK-READ-FROM-CACHE: [Persistent Cache]: using cached device binary #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp index 5392be5636b8d..6f27ee95d84a8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{run-unfiltered-devices} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run-unfiltered-devices} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index 40f092acc9605..3f10968c45f57 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -14,7 +14,7 @@ // UNSUPPORTED-TRACKER: GSD-4287 // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp index 62b42c8a6dd26..07992f0e39248 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp @@ -11,7 +11,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD // COM: Run test again in a directory that contains a different version of // `header1.hpp` diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp index 26a1dba1a61d2..8d013f9713cc7 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp @@ -12,8 +12,8 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{run} %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp index 4fe75db63356b..82b866353e615 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp index 6c32f9ac95afb..713c220419c75 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp index 31553e4dc19c7..00ac6c76fa10c 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp index c156f4c05b31a..3d149adff8ebe 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED-INTENDED: SYCL-RTC is not available for accelerator devices // RUN: %{build} -o %t.out -// RUN: %{run} %t.out | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out | FileCheck %s #include #include From 5791d9c81525f5aa9d408efb95146027029cddcb Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 18 Jun 2025 13:38:46 +0000 Subject: [PATCH 10/26] no run in rm --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 04c4b910b57fd..d3e5ef6e17b06 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -21,7 +21,7 @@ // depends on the target. Don't run eviction check for CUDA/HIP, so that we // don't have to find a magic number that works for all binaries (and by // definition is flaky). -// RUN: %{run} %if !(hip || cuda) %{ %{run-aux} rm -rf %t/cache_dir %} +// RUN: %{run-aux} rm -rf %t/cache_dir // RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include From 08e1ae7c2ed5794502d77c45598a5b9b89a7c4eb Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 23 Jun 2025 09:55:57 +0000 Subject: [PATCH 11/26] %run --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index d3e5ef6e17b06..cad6dc4a2b09f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -15,7 +15,7 @@ // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 // RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %{run} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we From 1fa01155401264132c8652716e4fe8b6766a8725 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 23 Jun 2025 08:39:21 -0400 Subject: [PATCH 12/26] Missing SYCL_JIT_AMDGCN_PTX_TARGET_CPU --- sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp index 07992f0e39248..b65e25ccb445b 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp @@ -15,7 +15,7 @@ // COM: Run test again in a directory that contains a different version of // `header1.hpp` -// RUN: cd %S/include/C ; %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD +// RUN: cd %S/include/C ; %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD #include #include From c621906a9650b69813fe3b45516a1fa67594f5d7 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 23 Jun 2025 12:58:13 +0000 Subject: [PATCH 13/26] sycl cache only on l0 and opencl --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index cad6dc4a2b09f..ee1c0d4a82d3a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -22,7 +22,7 @@ // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run} %if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From f0e0f72162796c9631a1d8a3f0671ac204b94e4f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 24 Jun 2025 09:06:27 +0000 Subject: [PATCH 14/26] run -> run-aux --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index ee1c0d4a82d3a..46df980d04d03 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -15,14 +15,14 @@ // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 // RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %{run-aux} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run} %if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run-aux}%if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From 4f1587bfdf49e8f5a690b06b859840fd674ad67c Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 24 Jun 2025 10:07:54 +0000 Subject: [PATCH 15/26] space --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 46df980d04d03..9e51c2d0968f4 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -22,7 +22,7 @@ // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run-aux}%if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run-aux} %if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From 0ebc239a9272b8d2dd2a98da6cfe37fa446fedd5 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 25 Jun 2025 10:18:34 +0000 Subject: [PATCH 16/26] ls --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 9e51c2d0968f4..e650b9414a4f7 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -22,7 +22,8 @@ // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run-aux} %if (opencl || level_zero) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run-aux} %if !(hip || cuda) %{ ls %t/cache_dir %} +// RUN: %{run-aux} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From 494f5474e5bcfe3998c68f0f468cd5b3d9fa5bcc Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 25 Jun 2025 10:50:32 +0000 Subject: [PATCH 17/26] ls --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index e650b9414a4f7..20c2e6703a1da 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -22,7 +22,6 @@ // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run-aux} %if !(hip || cuda) %{ ls %t/cache_dir %} // RUN: %{run-aux} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include From 8e74e151fb3a77d9d8dffa7507d778e78c8073c7 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 25 Jun 2025 12:28:48 +0000 Subject: [PATCH 18/26] rm can not be in run line --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 20c2e6703a1da..cad6dc4a2b09f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -15,14 +15,14 @@ // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 // RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run-aux} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %{run} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %{run-aux} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From 8eb60bdbfb6c4cd69831a211fe4958bb3c1650e7 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 25 Jun 2025 13:31:57 +0000 Subject: [PATCH 19/26] rm can not be in run line --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index cad6dc4a2b09f..e81e03645095e 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -14,14 +14,14 @@ // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 // RUN: %{build} -o %t.out -// RUN: %{run-aux} rm -rf %t/cache_dir +// RUN: rm -rf %t/cache_dir // RUN: %{run} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we // don't have to find a magic number that works for all binaries (and by // definition is flaky). -// RUN: %{run-aux} rm -rf %t/cache_dir +// RUN: rm -rf %t/cache_dir // RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include From ed00a53f26c803b422bd690d37409fc24013660f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 25 Jun 2025 18:51:05 +0000 Subject: [PATCH 20/26] Make sure to rm the temp dir after first run --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index e81e03645095e..f418afd966a4f 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -13,16 +13,16 @@ // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 -// RUN: %{build} -o %t.out -// RUN: rm -rf %t/cache_dir -// RUN: %{run} %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %{run-aux} rm -rf %t/cache_dir +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM +// RUN: %{run-aux} rm -rf %t/cache_dir // Eviction mechanism is based on the size of compiled kernels, which in turns // depends on the target. Don't run eviction check for CUDA/HIP, so that we // don't have to find a magic number that works for all binaries (and by // definition is flaky). -// RUN: rm -rf %t/cache_dir -// RUN: %{run} %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{run-aux} rm -rf %t/cache_dir +// RUN: %if !(hip || cuda) %{ %{run} %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From 469a670ee658f6da6df9ac9a902908683e2465fd Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 30 Jun 2025 06:42:19 +0000 Subject: [PATCH 21/26] cache --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index f418afd966a4f..bb098a422c322 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -22,7 +22,7 @@ // don't have to find a magic number that works for all binaries (and by // definition is flaky). // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %if !(hip || cuda) %{ %{run} %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} #include #include From efb5c32c4ffa7c79ddecb7ebe9d2ecf3d0d07d64 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 30 Jun 2025 07:26:52 +0000 Subject: [PATCH 22/26] correct num of sycl.hpp --- sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index ff128bdc16bea..5b97ae7a0cac9 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 24 +// CHECK-NUM-MATCHES: 26 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see From 5c1bf6f74aa69f57cdb4c96773bfcc50af5a192c Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Mon, 30 Jun 2025 08:18:39 +0000 Subject: [PATCH 23/26] don't run cache on gpu --- sycl/test-e2e/KernelCompiler/sycl_cache.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index bb098a422c322..15d3fe829ba70 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: (opencl || level_zero) +// Unlike other RTC tests, don't run this one on Cuda/HIP. Eviction mechanism +// is based on the size of compiled kernels, which in turns depends on the +// target. Don't run eviction check for CUDA/HIP, so that we don't have to find +// a magic number that works for all binaries (and by definition is flaky). + // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: accelerator @@ -13,16 +19,11 @@ // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir // DEFINE: %{max_cache_size} = SYCL_CACHE_MAX_SIZE=30000 +// RUN: %{build} -o %t.out // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM -// RUN: %{run-aux} rm -rf %t/cache_dir - -// Eviction mechanism is based on the size of compiled kernels, which in turns -// depends on the target. Don't run eviction check for CUDA/HIP, so that we -// don't have to find a magic number that works for all binaries (and by -// definition is flaky). +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-UNLIM // RUN: %{run-aux} rm -rf %t/cache_dir -// RUN: %if !(hip || cuda) %{ %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT %} +// RUN: %{cache_vars} %{max_cache_size} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-EVICT #include #include From 4b55904a0f35dab1745115ed8f16e069bd75c938 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Tue, 1 Jul 2025 10:39:50 +0000 Subject: [PATCH 24/26] sycl_imf suffers from 18390 --- sycl/test-e2e/KernelCompiler/sycl_imf.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp index 363728b318580..142cc2160ac52 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -15,6 +15,9 @@ // RUN: %{build} -o %t.out // RUN: %{l0_leak_check} %{run} %t.out +// XFAIL: preview-mode && run-mode +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 + #include #include #include From bed6a201b73703dbc70177df11a9efa97662af22 Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 2 Jul 2025 06:36:19 +0000 Subject: [PATCH 25/26] no sycl hpp update --- sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index 5b97ae7a0cac9..4292342068187 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 26 +// CHECK-NUM-MATCHES: 28 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see From 63f033915120275de1a40eb76777b60addb5783f Mon Sep 17 00:00:00 2001 From: Jakub Chlanda Date: Wed, 2 Jul 2025 10:43:51 +0000 Subject: [PATCH 26/26] ignore features --- sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 0a172c8a8c82a..ade0161991b2d 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -333,8 +333,9 @@ static void adjustArgs(const InputArgList &UserArgList, DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { - auto [CPU, _] = + auto [CPU, Features] = Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + (void)Features; if (Format == BinaryFormat::AMDGCN) { DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), "amdgcn-amd-amdhsa"); @@ -631,8 +632,9 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, if (IsCudaHIP) { Triple T{Module.getTargetTriple()}; Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags}; - auto [CPU, _] = + auto [CPU, Features] = Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format); + (void)Features; // Helper lambda to link modules. auto LinkInLib = [&](const StringRef LibDevice) -> Error { ModuleUPtr LibDeviceModule;