diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index d9a7689115f83..b11ca93c7fe3a 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -1162,6 +1162,20 @@ void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args, HostTC.AddIAMCUIncludeArgs(Args, CC1Args); } +llvm::SmallVector +CudaToolChain::getDeviceLibs( + const llvm::opt::ArgList &DriverArgs, + const Action::OffloadKind DeviceOffloadingKind) const { + StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ); + std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); + if (LibDeviceFile.empty()) { + getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch; + return {}; + } + + return {BitCodeLibraryInfo{LibDeviceFile}}; +} + SanitizerMask CudaToolChain::getSupportedSanitizers() const { // The CudaToolChain only supports sanitizers in the sense that it allows // sanitizer arguments on the command line if they are supported by the host diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h index 3fa95b6f3d2e9..1b1eb59082c2c 100644 --- a/clang/lib/Driver/ToolChains/Cuda.h +++ b/clang/lib/Driver/ToolChains/Cuda.h @@ -248,6 +248,10 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain { void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs, llvm::opt::ArgStringList &CC1Args) const override; + llvm::SmallVector + getDeviceLibs(const llvm::opt::ArgList &Args, + const Action::OffloadKind DeviceOffloadingKind) const override; + SanitizerMask getSupportedSanitizers() const override; VersionTuple diff --git a/sycl-jit/jit-compiler/include/RTC.h b/sycl-jit/jit-compiler/include/RTC.h index a1a39d5b50ad3..6a690df459646 100644 --- a/sycl-jit/jit-compiler/include/RTC.h +++ b/sycl-jit/jit-compiler/include/RTC.h @@ -176,10 +176,11 @@ class RTCResult { /// Calculates a BLAKE3 hash of the pre-processed source string described by /// \p SourceFile (considering any additional \p IncludeFiles) and the -/// concatenation of the \p UserArgs. +/// concatenation of the \p UserArgs, for a given \p Format. JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs); + View UserArgs, + BinaryFormat Format); /// Compiles, links against device libraries, and finalizes the device code in /// the source string described by \p SourceFile, considering any additional \p @@ -191,10 +192,14 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, /// /// If \p SaveIR is true and \p CachedIR is empty, the LLVM module obtained from /// the frontend invocation is wrapped in bitcode format in the result object. +/// +/// \p BinaryFormat describes the desired format of the compilation - which +/// corresponds to the backend that is being targeted. JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR); + View CachedIR, bool SaveIR, + BinaryFormat Format); /// Requests that the JIT binary referenced by \p Address is deleted from the /// `JITContext`. diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp index 20891e7615b5c..902071a1fe2b4 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp @@ -8,12 +8,16 @@ #include "DeviceCompilation.h" #include "ESIMD.h" +#include "JITBinaryInfo.h" +#include "translation/Translation.h" #include #include #include #include +#include #include +#include #include #include #include @@ -52,6 +56,7 @@ using namespace llvm::opt; using namespace llvm::sycl; using namespace llvm::module_split; using namespace llvm::util; +using namespace llvm::vfs; using namespace jit_compiler; #ifdef _GNU_SOURCE @@ -313,7 +318,7 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler { } // anonymous namespace static void adjustArgs(const InputArgList &UserArgList, - const std::string &DPCPPRoot, + const std::string &DPCPPRoot, BinaryFormat Format, SmallVectorImpl &CommandLine) { DerivedArgList DAL{UserArgList}; const auto &OptTable = getDriverOptTable(); @@ -326,6 +331,17 @@ static void adjustArgs(const InputArgList &UserArgList, // unused argument warning. DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments)); + if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) { + auto [CPU, Features] = + Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format); + (void)Features; + StringRef OT = Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda" + : "amdgcn-amd-amdhsa"; + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), OT); + DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU); + } + ArgStringList ASL; for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); }); for_each(UserArgList, @@ -362,10 +378,9 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot, }); } -Expected -jit_compiler::calculateHash(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList) { +Expected jit_compiler::calculateHash( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, BinaryFormat Format) { TimeTraceScope TTS{"calculateHash"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -374,7 +389,7 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, } SmallVector CommandLine; - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -400,11 +415,10 @@ jit_compiler::calculateHash(InMemoryFile SourceFile, return createStringError("Calculating source hash failed"); } -Expected -jit_compiler::compileDeviceCode(InMemoryFile SourceFile, - View IncludeFiles, - const InputArgList &UserArgList, - std::string &BuildLog, LLVMContext &Context) { +Expected jit_compiler::compileDeviceCode( + InMemoryFile SourceFile, View IncludeFiles, + const InputArgList &UserArgList, std::string &BuildLog, + LLVMContext &Context, BinaryFormat Format) { TimeTraceScope TTS{"compileDeviceCode"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -413,7 +427,7 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, } SmallVector CommandLine; - adjustArgs(UserArgList, DPCPPRoot, CommandLine); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); FixedCompilationDatabase DB{".", CommandLine}; ClangTool Tool{DB, {SourceFile.Path}}; @@ -431,12 +445,22 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile, return createStringError(BuildLog); } -// This function is a simplified copy of the device library selection process in -// `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V target -// (no AoT, no third-party GPUs, no native CPU). Keep in sync! +// This function is a simplified copy of the device library selection process +// in `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V, or +// GPU targets (no AoT, no native CPU). Keep in sync! static bool getDeviceLibraries(const ArgList &Args, SmallVectorImpl &LibraryList, - DiagnosticsEngine &Diags) { + DiagnosticsEngine &Diags, BinaryFormat Format) { + // For CUDA/HIP we only need devicelib, early exit here. + if (Format == BinaryFormat::PTX) { + LibraryList.push_back( + Args.MakeArgString("devicelib-nvptx64-nvidia-cuda.bc")); + return false; + } else if (Format == BinaryFormat::AMDGCN) { + LibraryList.push_back(Args.MakeArgString("devicelib-amdgcn-amd-amdhsa.bc")); + return false; + } + struct DeviceLibOptInfo { StringRef DeviceLibName; StringRef DeviceLibOption; @@ -541,7 +565,8 @@ static Expected loadBitcodeLibrary(StringRef LibPath, Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, const InputArgList &UserArgList, - std::string &BuildLog) { + std::string &BuildLog, + BinaryFormat Format) { TimeTraceScope TTS{"linkDeviceLibraries"}; const std::string &DPCPPRoot = getDPCPPRoot(); @@ -556,11 +581,29 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, /* ShouldOwnClient=*/false); SmallVector LibNames; - bool FoundUnknownLib = getDeviceLibraries(UserArgList, LibNames, Diags); + const bool FoundUnknownLib = + getDeviceLibraries(UserArgList, LibNames, Diags, Format); if (FoundUnknownLib) { return createStringError("Could not determine list of device libraries: %s", BuildLog.c_str()); } + const bool IsCudaHIP = + Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN; + if (IsCudaHIP) { + // Based on the OS and the format decide on the version of libspirv. + // NOTE: this will be problematic if cross-compiling between OSes. + std::string Libclc{"clc/"}; + Libclc.append( +#ifdef _WIN32 + "remangled-l32-signed_char.libspirv-" +#else + "remangled-l64-signed_char.libspirv-" +#endif + ); + Libclc.append(Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda.bc" + : "amdgcn-amd-amdhsa.bc"); + LibNames.push_back(Libclc); + } LLVMContext &Context = Module.getContext(); for (const std::string &LibName : LibNames) { @@ -578,6 +621,72 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module, } } + // For GPU targets we need to link against vendor provided libdevice. + if (IsCudaHIP) { + std::string Argv0 = DPCPPRoot + "/bin/clang++"; + Triple T{Module.getTargetTriple()}; + IntrusiveRefCntPtr OFS{ + new OverlayFileSystem{getRealFileSystem()}}; + IntrusiveRefCntPtr VFS{new InMemoryFileSystem}; + std::string CppFileName{"a.cpp"}; + VFS->addFile(CppFileName, /*ModificationTime=*/0, + MemoryBuffer::getMemBuffer("", "")); + OFS->pushOverlay(VFS); + Driver D{Argv0, T.getTriple(), Diags, "dpcpp compiler driver", OFS}; + + SmallVector CommandLine; + CommandLine.push_back(Argv0); + adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine); + CommandLine.push_back(CppFileName); + SmallVector CommandLineCStr(CommandLine.size()); + llvm::transform(CommandLine, CommandLineCStr.begin(), + [](const auto &S) { return S.c_str(); }); + + Compilation *C = D.BuildCompilation(CommandLineCStr); + if (!C) { + return createStringError("Unable to construct driver for CUDA/HIP"); + } + + const ToolChain *OffloadTC = + C->getSingleOffloadToolChain(); + InputArgList EmptyArgList; + auto Archs = + D.getOffloadArchs(*C, EmptyArgList, Action::OFK_SYCL, OffloadTC); + assert(Archs.size() == 1 && + "Offload toolchain should be configured to single architecture"); + StringRef CPU = *Archs.begin(); + + // Pass only `-march=` or `-mcpu=` with the GPU arch determined by the + // driver to `getDeviceLibs`. + DerivedArgList CPUArgList{EmptyArgList}; + if (Format == BinaryFormat::PTX) { + CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_march_EQ), + CPU); + } else { + CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_mcpu_EQ), CPU); + } + + SmallVector CommonDeviceLibs = + OffloadTC->getDeviceLibs(CPUArgList, Action::OffloadKind::OFK_SYCL); + if (CommonDeviceLibs.empty()) { + return createStringError("Unable to find common device libraries"); + } + + for (auto &Lib : CommonDeviceLibs) { + ModuleUPtr LibModule; + if (auto Error = + loadBitcodeLibrary(Lib.Path, Context).moveInto(LibModule)) { + return Error; + } + + if (Linker::linkModules(Module, std::move(LibModule), + Linker::LinkOnlyNeeded)) { + return createStringError("Unable to link device library %s: %s", + Lib.Path.c_str(), BuildLog.c_str()); + } + } + } + return Error::success(); } diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h index 62da2cdb54bf1..aa1b19df0cc8f 100644 --- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h +++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h @@ -8,6 +8,7 @@ #pragma once +#include "JITBinaryInfo.h" #include "RTC.h" #include @@ -24,16 +25,17 @@ using ModuleUPtr = std::unique_ptr; llvm::Expected calculateHash(InMemoryFile SourceFile, View IncludeFiles, - const llvm::opt::InputArgList &UserArgList); + const llvm::opt::InputArgList &UserArgList, BinaryFormat Format); llvm::Expected compileDeviceCode(InMemoryFile SourceFile, View IncludeFiles, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog, llvm::LLVMContext &Context); + std::string &BuildLog, llvm::LLVMContext &Context, + BinaryFormat Format); llvm::Error linkDeviceLibraries(llvm::Module &Module, const llvm::opt::InputArgList &UserArgList, - std::string &BuildLog); + std::string &BuildLog, BinaryFormat Format); using PostLinkResult = std::pair>; llvm::Expected diff --git a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp index b26d6212fe3e5..d54d2bcd2b863 100644 --- a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp +++ b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "RTC.h" +#include "JITBinaryInfo.h" #include "helper/ErrorHelper.h" #include "rtc/DeviceCompilation.h" #include "translation/SPIRVLLVMTranslation.h" @@ -26,7 +27,8 @@ using namespace jit_compiler; JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, View IncludeFiles, - View UserArgs) { + View UserArgs, + BinaryFormat Format) { llvm::opt::InputArgList UserArgList; if (auto Error = parseUserArgs(UserArgs).moveInto(UserArgList)) { return errorTo(std::move(Error), @@ -36,8 +38,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, auto Start = std::chrono::high_resolution_clock::now(); std::string Hash; - if (auto Error = - calculateHash(SourceFile, IncludeFiles, UserArgList).moveInto(Hash)) { + if (auto Error = calculateHash(SourceFile, IncludeFiles, UserArgList, Format) + .moveInto(Hash)) { return errorTo(std::move(Error), "Hashing failed", /*IsHash=*/false); } @@ -55,7 +57,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile, JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, View IncludeFiles, View UserArgs, - View CachedIR, bool SaveIR) { + View CachedIR, bool SaveIR, + BinaryFormat Format) { llvm::LLVMContext Context; std::string BuildLog; configureDiagnostics(Context, BuildLog); @@ -104,7 +107,7 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, bool FromSource = !Module; if (FromSource) { if (auto Error = compileDeviceCode(SourceFile, IncludeFiles, UserArgList, - BuildLog, Context) + BuildLog, Context, Format) .moveInto(Module)) { return errorTo(std::move(Error), "Device compilation failed"); } @@ -118,7 +121,8 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, IR = RTCDeviceCodeIR{BCString.data(), BCString.data() + BCString.size()}; } - if (auto Error = linkDeviceLibraries(*Module, UserArgList, BuildLog)) { + if (auto Error = + linkDeviceLibraries(*Module, UserArgList, BuildLog, Format)) { return errorTo(std::move(Error), "Device linking failed"); } @@ -131,9 +135,9 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile, for (auto [DevImgInfo, Module] : llvm::zip_equal(BundleInfo.DevImgInfos, Modules)) { - if (auto Error = Translator::translate(*Module, JITContext::getInstance(), - BinaryFormat::SPIRV) - .moveInto(DevImgInfo.BinaryInfo)) { + if (auto Error = + Translator::translate(*Module, JITContext::getInstance(), Format) + .moveInto(DevImgInfo.BinaryInfo)) { return errorTo(std::move(Error), "SPIR-V translation failed"); } } diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp index 457b6bfc0e637..4acd6c81c7129 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp +++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp @@ -83,9 +83,6 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, LLVMInitializeNVPTXAsmPrinter(); LLVMInitializeNVPTXTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"nvptx64-nvidia-cuda"}; std::string ErrorMessage; @@ -99,32 +96,11 @@ llvm::Expected Translator::translateToPTX(llvm::Module &Mod, ErrorMessage.c_str()); } - // Give priority to user specified values (through environment variables: - // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - // If they were not set, use default and consult the module for alternatives - // (if present). - if (CPU.empty()) { - CPU = "sm_50"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - Features = "+sm_50,+ptx76"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::PTX); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); llvm::legacy::PassManager PM; @@ -166,9 +142,6 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, LLVMInitializeAMDGPUAsmPrinter(); LLVMInitializeAMDGPUTargetMC(); - static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; - static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; - std::string TargetTriple{"amdgcn-amd-amdhsa"}; std::string ErrorMessage; @@ -181,29 +154,10 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, "Failed to load and translate AMDGCN LLVM IR module with error %s", ErrorMessage.c_str()); - auto CPUVal = ConfigHelper::get(); - auto FeaturesVal = ConfigHelper::get(); - llvm::StringRef CPU = CPUVal.begin(); - llvm::StringRef Features = FeaturesVal.begin(); - - auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr; - if (CPU.empty()) { - // Set to the lowest tested target according to the GetStartedGuide, section - // "Build DPC++ toolchain with support for HIP AMD" - CPU = "gfx906"; - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { - CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); - } - } - if (Features.empty()) { - if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { - Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) - .getValueAsString(); - } - } - + auto [CPU, Features] = + getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::AMDGCN); std::unique_ptr TargetMachine(Target->createTargetMachine( - Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, + Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt, llvm::CodeGenOptLevel::Default)); std::string AMDObj; @@ -226,3 +180,34 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx, return &JITCtx.emplaceBinary(std::move(AMDObj), BinaryFormat::AMDGCN); #endif // JIT_SUPPORT_AMDGCN } + +std::pair Translator::getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format) { + assert((Format == BinaryFormat::AMDGCN || Format == BinaryFormat::PTX) && + "Unexpected format found"); + static const char *TARGET_CPU_ATTRIBUTE = "target-cpu"; + static const char *TARGET_FEATURE_ATTRIBUTE = "target-features"; + // Give priority to user specified values (through environment variables: + // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES). + auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr; + auto CPUVal = ConfigHelper::get(); + auto FeaturesVal = ConfigHelper::get(); + llvm::StringRef CPU{CPUVal.begin(), CPUVal.size()}; + llvm::StringRef Features{FeaturesVal.begin(), FeaturesVal.size()}; + if (CPU.empty()) { + // Set to the lowest tested target according to the GetStartedGuide, section + // "Build DPC++ toolchain with support for HIP AMD" + CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50"; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) { + CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString(); + } + } + if (Features.empty()) { + Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : ""; + if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) { + Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE) + .getValueAsString(); + } + } + return std::make_pair(std::string{CPU}, std::string{Features}); +} diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.h b/sycl-jit/jit-compiler/lib/translation/Translation.h index 9c8dfdda5f9f5..78bf4ef8ac416 100644 --- a/sycl-jit/jit-compiler/lib/translation/Translation.h +++ b/sycl-jit/jit-compiler/lib/translation/Translation.h @@ -12,7 +12,6 @@ #include "JITContext.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" -#include namespace jit_compiler { @@ -26,6 +25,9 @@ class Translator { translate(llvm::Module &Mod, JITContext &JITCtx, BinaryFormat Format, const char *KernelName = nullptr); + std::pair static getTargetCPUAndFeatureAttrs( + const llvm::Module *M, const char *KernelName, BinaryFormat Format); + private: /// Pair of address and size to represent a binary blob. using BinaryBlob = std::pair; diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp index b4956a82ce305..0cbc009cc88c7 100644 --- a/sycl/source/detail/device_image_impl.hpp +++ b/sycl/source/detail/device_image_impl.hpp @@ -8,6 +8,9 @@ #pragma once +#if SYCL_EXT_JIT_ENABLE +#include "JITBinaryInfo.h" +#endif // SYCL_EXT_JIT_ENABLE #include #include #include @@ -702,6 +705,28 @@ class device_image_impl return MRTCBinInfo && MRTCBinInfo->MLanguage == Lang; } + static ::jit_compiler::BinaryFormat + getTargetFormat([[maybe_unused]] const backend Backend) { +#if SYCL_EXT_JIT_ENABLE + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::opencl: + return ::jit_compiler::BinaryFormat::SPIRV; + case backend::ext_oneapi_cuda: + return ::jit_compiler::BinaryFormat::PTX; + case backend::ext_oneapi_hip: + return ::jit_compiler::BinaryFormat::AMDGCN; + default: + throw sycl::exception( + sycl::make_error_code(sycl::errc::invalid), + "Backend does not support kernel_compiler extension"); + } +#else + throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), + "JIT not supported"); +#endif // SYCL_EXT_JIT_ENABLE + } + std::vector> buildFromSource( const std::vector &Devices, const std::vector &BuildOptions, @@ -732,9 +757,12 @@ class device_image_impl } } - if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) + if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) { + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::executable, BuildOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); + } std::vector DeviceVec; DeviceVec.reserve(Devices.size()); @@ -822,8 +850,10 @@ class device_image_impl "device does not support source language"); } } + const auto Format = getTargetFormat(MContext.get_backend()); return createSYCLImages(Devices, bundle_state::object, CompileOptions, - LogPtr, RegisteredKernelNames, OutDeviceBins); + LogPtr, RegisteredKernelNames, OutDeviceBins, + Format); } private: @@ -995,8 +1025,8 @@ class device_image_impl const std::vector &Options, std::string *LogPtr, const std::vector &RegisteredKernelNames, - std::vector> &OutDeviceBins) - const { + std::vector> &OutDeviceBins, + ::jit_compiler::BinaryFormat Format) const { assert(MRTCBinInfo); assert(MRTCBinInfo->MLanguage == syclex::source_language::sycl); assert(std::holds_alternative(MBinImage)); @@ -1026,7 +1056,7 @@ class device_image_impl auto [Binaries, Prefix] = syclex::detail::SYCL_JIT_Compile( RegisteredKernelNames.empty() ? SourceStr : SourceExt.str(), - MRTCBinInfo->MIncludePairs, Options, LogPtr); + MRTCBinInfo->MIncludePairs, Options, LogPtr, Format); auto &PM = detail::ProgramManager::getInstance(); diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index 189bd3d145309..2884e25399e5f 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -263,6 +263,7 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( Binary.addOffloadEntry(std::move(Entry)); } + bool FinalizationTagAdded = false; for (const auto &FPS : DevImgInfo.Properties) { bool IsDeviceGlobalsPropSet = FPS.Name == __SYCL_PROPERTY_SET_SYCL_DEVICE_GLOBALS; @@ -279,18 +280,50 @@ sycl_device_binaries jit_compiler::createDeviceBinaries( sycl_property_type::SYCL_PROPERTY_TYPE_BYTE_ARRAY}); } } + if (FPS.Name == __SYCL_PROPERTY_SET_PROGRAM_METADATA && + DevImgInfo.BinaryInfo.Format == + ::jit_compiler::BinaryFormat::AMDGCN) { + PropSet.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + FinalizationTagAdded = true; + } Binary.addProperty(std::move(PropSet)); Binary.setCompileOptions(BundleInfo.CompileOptions.c_str()); } - Collection->addDeviceBinary(std::move(Binary), - DevImgInfo.BinaryInfo.BinaryStart, - DevImgInfo.BinaryInfo.BinarySize, - (DevImgInfo.BinaryInfo.AddressBits == 64) - ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 - : __SYCL_DEVICE_BINARY_TARGET_SPIRV32, - SYCL_DEVICE_BINARY_TYPE_SPIRV); + auto BinaryTarget = ""; + auto Format = SYCL_DEVICE_BINARY_TYPE_NONE; + switch (DevImgInfo.BinaryInfo.Format) { + case ::jit_compiler::BinaryFormat::SPIRV: + BinaryTarget = DevImgInfo.BinaryInfo.AddressBits == 64 + ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64 + : __SYCL_DEVICE_BINARY_TARGET_SPIRV32; + Format = SYCL_DEVICE_BINARY_TYPE_SPIRV; + break; + case ::jit_compiler::BinaryFormat::PTX: + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_NVPTX64; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + break; + case ::jit_compiler::BinaryFormat::AMDGCN: { + BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_AMDGCN; + Format = SYCL_DEVICE_BINARY_TYPE_NONE; + // If the program had no properties, the tag needs to be added now. + if (!FinalizationTagAdded) { + PropertySetContainer ProgramMetadata{ + __SYCL_PROPERTY_SET_PROGRAM_METADATA}; + ProgramMetadata.addProperty(PropertyContainer{ + __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1}); + Binary.addProperty(std::move(ProgramMetadata)); + } + break; + } + default: + assert(false && "Unsupported format"); + }; + Collection->addDeviceBinary( + std::move(Binary), DevImgInfo.BinaryInfo.BinaryStart, + DevImgInfo.BinaryInfo.BinarySize, BinaryTarget, Format); } sycl_device_binaries Binaries = Collection->getPIDeviceStruct(); @@ -311,7 +344,26 @@ void jit_compiler::destroyDeviceBinaries(sycl_device_binaries Binaries) { std::pair jit_compiler::compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr) { + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format) { + if (Format == ::jit_compiler::BinaryFormat::PTX || + Format == ::jit_compiler::BinaryFormat::AMDGCN) { + // If present, set-up the config with env variables describing CPU and + // features. + auto SetUpOption = [](const std::string &Value) { + ::jit_compiler::JITEnvVar Option(Value.begin(), Value.end()); + return Option; + }; + ::jit_compiler::JITEnvVar TargetCPUOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetCPU::set(TargetCPUOpt)); + ::jit_compiler::JITEnvVar TargetFeaturesOpt = SetUpOption( + detail::SYCLConfig::get()); + this->AddToConfigHandle( + ::jit_compiler::option::JITTargetFeatures::set(TargetFeaturesOpt)); + } + auto appendToLog = [LogPtr](const char *Msg) { if (LogPtr) { LogPtr->append(Msg); @@ -339,7 +391,7 @@ std::pair jit_compiler::compileSYCL( std::vector CachedIR; if (PersistentDeviceCodeCache::isEnabled()) { auto Result = - CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView); + CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView, Format); if (Result.failed()) { appendToLog(Result.getPreprocLog()); @@ -349,8 +401,9 @@ std::pair jit_compiler::compileSYCL( } } - auto Result = CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, - CachedIR, /*SaveIR=*/!CacheKey.empty()); + auto Result = + CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, CachedIR, + /*SaveIR=*/!CacheKey.empty(), Format); const char *BuildLog = Result.getBuildLog(); appendToLog(BuildLog); diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp index 31f87cd2be7e0..ac3ebf7d73fa4 100644 --- a/sycl/source/detail/jit_compiler.hpp +++ b/sycl/source/detail/jit_compiler.hpp @@ -47,7 +47,8 @@ class jit_compiler { std::pair compileSYCL( const std::string &CompilationID, const std::string &SYCLSource, const std::vector> &IncludePairs, - const std::vector &UserArgs, std::string *LogPtr); + const std::vector &UserArgs, std::string *LogPtr, + ::jit_compiler::BinaryFormat Format); void destroyDeviceBinaries(sycl_device_binaries Binaries); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp index 93ec94a8ac328..418842fdc48ec 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp @@ -44,7 +44,8 @@ std::pair SYCL_JIT_Compile( [[maybe_unused]] const std::string &SYCLSource, [[maybe_unused]] const include_pairs_t &IncludePairs, [[maybe_unused]] const std::vector &UserArgs, - [[maybe_unused]] std::string *LogPtr) { + [[maybe_unused]] std::string *LogPtr, + [[maybe_unused]] ::jit_compiler::BinaryFormat Format) { #if SYCL_EXT_JIT_ENABLE static std::atomic_uintptr_t CompilationCounter; std::string CompilationID = "rtc_" + std::to_string(CompilationCounter++); @@ -52,7 +53,7 @@ std::pair SYCL_JIT_Compile( for (const sycl::detail::string_view UserArg : UserArgs) UserArgStrings.push_back(UserArg.data()); return sycl::detail::jit_compiler::get_instance().compileSYCL( - CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr); + CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr, Format); #else throw sycl::exception(sycl::errc::build, "kernel_compiler via sycl-jit is not available"); diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp index c088122a33ff5..0dfaf63a1d8da 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp @@ -8,6 +8,9 @@ #pragma once +#if SYCL_EXT_JIT_ENABLE +#include "JITBinaryInfo.h" +#endif // SYCL_EXT_JIT_ENABLE #include #include // __SYCL_EXPORT #include @@ -39,7 +42,7 @@ userArgsAsString(const std::vector &UserArguments); std::pair SYCL_JIT_Compile(const std::string &Source, const include_pairs_t &IncludePairs, const std::vector &UserArgs, - std::string *LogPtr); + std::string *LogPtr, ::jit_compiler::BinaryFormat Format); void SYCL_JIT_Destroy(sycl_device_binaries Binaries); diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index fcc380c9a537b..c18bdbc46a62c 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -409,8 +409,10 @@ bool is_source_kernel_bundle_supported( const std::vector &DeviceImplVec) { backend BE = DeviceImplVec[0]->getBackend(); // Support is limited to the opencl and level_zero backends. - bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) || - (BE == sycl::backend::opencl); + bool BE_Acceptable = BE == sycl::backend::ext_oneapi_level_zero || + BE == sycl::backend::opencl || + BE == sycl::backend::ext_oneapi_hip || + BE == sycl::backend::ext_oneapi_cuda; if (!BE_Acceptable) return false; diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp index 3095d570be946..13003bea98a62 100644 --- a/sycl/test-e2e/KernelCompiler/sycl.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 @@ -108,15 +107,14 @@ auto constexpr DeviceLibrariesSource = R"===( #include #include #include -#include extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) void device_libs_kernel(float *ptr) { // Extension list: llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp - // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing it. - // Only test the fp32 variants of complex, math and imf to keep this test + // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing + // it. Only test the fp32 variants of complex and math to keep this test // device-agnostic. // cl_intel_devicelib_math @@ -128,14 +126,8 @@ void device_libs_kernel(float *ptr) { // cl_intel_devicelib_cstring ptr[2] = memcmp(ptr + 2, ptr + 2, sizeof(float)); - // cl_intel_devicelib_imf - ptr[3] = sycl::ext::intel::math::sqrt(ptr[3] * 2); - - // cl_intel_devicelib_imf_bf16 - ptr[4] = sycl::ext::intel::math::float2bfloat16(ptr[4] * 0.5f); - // cl_intel_devicelib_bfloat16 - ptr[5] = sycl::ext::oneapi::bfloat16{ptr[5] / 0.25f}; + ptr[3] = sycl::ext::oneapi::bfloat16{ptr[3] / 0.25f}; } )==="; @@ -356,7 +348,7 @@ int test_device_libraries(sycl::queue q) { exe_kb kbExe = syclex::build(kbSrc); sycl::kernel k = kbExe.ext_oneapi_get_kernel("device_libs_kernel"); - constexpr size_t nElem = 6; + constexpr size_t nElem = 4; float *ptr = sycl::malloc_shared(nElem, q); for (int i = 0; i < nElem; ++i) ptr[i] = 1.0f; diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp index 5832d5ff50685..fbb7768fd8ac8 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp index 405d97ce15e96..0cffec11ffa47 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp @@ -7,6 +7,11 @@ //===----------------------------------------------------------------------===// // REQUIRES: (opencl || level_zero) +// Unlike other RTC tests, don't run this one on Cuda/HIP. Eviction mechanism +// is based on the size of compiled kernels, which in turns depends on the +// target. Don't run eviction check for CUDA/HIP, so that we don't have to find +// a magic number that works for all binaries (and by definition is flaky). + // REQUIRES: aspect-usm_device_allocations // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp index e72c733fdbf9f..7d41f3a3f79d0 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp @@ -6,10 +6,8 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) - // RUN: %{build} -o %t.out -// RUN: %{run-unfiltered-devices} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run-unfiltered-devices} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp index 2296aba1d9504..ba8b08584df81 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp @@ -6,14 +6,13 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // UNSUPPORTED: opencl && gpu // UNSUPPORTED-TRACKER: GSD-4287 // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp index 410f4f3e8d753..4ca1f5296dc7b 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp @@ -8,6 +8,8 @@ // REQUIRES: (opencl || level_zero) +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle is built that exports a symbol and // -- other kernel bundles that uses it are compiled/linked without it. These // -- cases should fail due to unresolved symbols, rather than picking up the diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp new file mode 100644 index 0000000000000..142cc2160ac52 --- /dev/null +++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp @@ -0,0 +1,79 @@ +//==--- sycl_imf.cpp --- kernel_compiler extension imf tests ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: aspect-usm_device_allocations +// REQUIRES: (opencl || level_zero) + +// UNSUPPORTED: accelerator +// UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there. + +// RUN: %{build} -o %t.out +// RUN: %{l0_leak_check} %{run} %t.out + +// XFAIL: preview-mode && run-mode +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 + +#include +#include +#include + +auto constexpr IMFSources = R"===( +#include +#include +#include + +extern "C" SYCL_EXTERNAL +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel) +void imf_kernel(float *ptr) { + // cl_intel_devicelib_imf + ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2); + + // cl_intel_devicelib_imf_bf16 + ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f); +} +)==="; + +int main() { +#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER + sycl::queue q; + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::context ctx = q.get_context(); + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, IMFSources); + exe_kb kbExe = syclex::build(kbSrc); + + sycl::kernel k = kbExe.ext_oneapi_get_kernel("imf_kernel"); + constexpr size_t nElem = 2; + float *ptr = sycl::malloc_shared(nElem, q); + for (int i = 0; i < nElem; ++i) + ptr[i] = 1.0f; + + q.submit([&](sycl::handler &cgh) { + cgh.set_arg(0, ptr); + cgh.single_task(k); + }); + q.wait_and_throw(); + + // Check that the kernel was executed. Given the {1.0, 1.0} input, + // the expected result is approximately {1.41, 0.5}. + for (unsigned i = 0; i < nElem; ++i) { + std::cout << ptr[i] << ' '; + assert(ptr[i] != 1.0f); + } + std::cout << std::endl; + + sycl::free(ptr, q); +#else + static_assert(false, "Kernel Compiler feature test macro undefined"); +#endif + return 0; +} diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp index 0cc50413dbad4..d073390a3d6a9 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp @@ -6,15 +6,14 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD // COM: Run test again in a directory that contains a different version of // `header1.hpp` -// RUN: cd %S/include/C ; %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD +// RUN: cd %S/include/C ; %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp index 33cf81dd6bdcb..1cc0c8e8d2d8e 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp @@ -6,12 +6,11 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations // RUN: %{build} -o %t.out -// RUN: %{run} %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out // XFAIL: preview-mode && run-mode // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390 diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp index 94864926af4c4..7d38e44cab6c2 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s #include #include @@ -48,14 +47,15 @@ int test_lifetimes() { ctx, syclex::source_language::sycl, SYCLSource); exe_kb kbExe1 = syclex::build(kbSrc); - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG1:.*]])) + // Cuda/Hip programs will be created with Binary, spirv IL. + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]])) { std::cout << "Scope1\n"; // CHECK: Scope1 exe_kb kbExe2 = syclex::build(kbSrc); // kbExe2 goes out of scope; its kernels are removed from program mananager. - // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG2:.*]])) + // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG2:.*]])) // CHECK: urProgramRelease{{.*}}[[PROG2]] } std::cout << "End Scope1\n"; diff --git a/sycl/test-e2e/KernelCompiler/sycl_link.cpp b/sycl/test-e2e/KernelCompiler/sycl_link.cpp index 7187488d0f0dc..1453e02b773dc 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link.cpp @@ -12,6 +12,8 @@ // -- Test for a simple linking case with source files compiled from SYCL source // -- at runtime. +// Note linking is not supported on CUDA/HIP. + // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %{l0_leak_check} %{run} %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp index b1d66710c2fa2..f639a9bb3a337 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols. // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp index 4976a9066bc3e..5183a927881e1 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for linking where two kernels use the same imported symbols, but one // -- may not be supported on the device. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp index 4f81eeb684fe4..0739a5d9da00a 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for a case where a kernel bundle with an exported symbol is compiled // -- before another kernel bundle using a different variant of the symbol. diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp index 41b8c99c137a3..0636fbff37bf7 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp @@ -9,6 +9,8 @@ // REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_shared_allocations +// Note linking is not supported on CUDA/HIP. + // -- Test for the linking of two kernels with conflicting definitions of // -- kernels with the same name. diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp index 89ad5f5e67c40..29c54161506f9 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp index 2e7b5fee432a5..a1b6cb787d030 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp @@ -6,11 +6,10 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) // REQUIRES: aspect-usm_device_allocations // RUN: %{build} -o %t.out -// RUN: %{l0_leak_check} %{run} %t.out +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out #include #include diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp index d3916dfc3803b..4b1a04d1bcb8d 100644 --- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp +++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp @@ -5,9 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// REQUIRES: (opencl || level_zero) + // RUN: %{build} -o %t.out -// RUN: %{run} %t.out | FileCheck %s +// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out | FileCheck %s #include #include diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 44da0ef1c52c4..ff2f8e36ecfb7 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -244,6 +244,7 @@ def execute(self, test, litConfig): ) sycl_target_opts += hip_arch_opts substitutions.append(("%{hip_arch_opts}", hip_arch_opts)) + substitutions.append(("%{amd_arch}", test.config.amd_arch)) if ( "target-spir" in build_targets and "spirv-backend" in test.config.available_features diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp index 4292342068187..7f3757fc70624 100644 --- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp @@ -6,7 +6,7 @@ // CHECK-DAG: README.md // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 28 +// CHECK-NUM-MATCHES: 29 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see