diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index d9a7689115f83..b11ca93c7fe3a 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -1162,6 +1162,20 @@ void CudaToolChain::AddIAMCUIncludeArgs(const ArgList &Args,
   HostTC.AddIAMCUIncludeArgs(Args, CC1Args);
 }
 
+llvm::SmallVector<ToolChain::BitCodeLibraryInfo, 12>
+CudaToolChain::getDeviceLibs(
+    const llvm::opt::ArgList &DriverArgs,
+    const Action::OffloadKind DeviceOffloadingKind) const {
+  StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_march_EQ);
+  std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
+  if (LibDeviceFile.empty()) {
+    getDriver().Diag(diag::err_drv_no_cuda_libdevice) << GpuArch;
+    return {};
+  }
+
+  return {BitCodeLibraryInfo{LibDeviceFile}};
+}
+
 SanitizerMask CudaToolChain::getSupportedSanitizers() const {
   // The CudaToolChain only supports sanitizers in the sense that it allows
   // sanitizer arguments on the command line if they are supported by the host
diff --git a/clang/lib/Driver/ToolChains/Cuda.h b/clang/lib/Driver/ToolChains/Cuda.h
index 3fa95b6f3d2e9..1b1eb59082c2c 100644
--- a/clang/lib/Driver/ToolChains/Cuda.h
+++ b/clang/lib/Driver/ToolChains/Cuda.h
@@ -248,6 +248,10 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public NVPTXToolChain {
   void AddIAMCUIncludeArgs(const llvm::opt::ArgList &DriverArgs,
                            llvm::opt::ArgStringList &CC1Args) const override;
 
+  llvm::SmallVector<BitCodeLibraryInfo, 12>
+  getDeviceLibs(const llvm::opt::ArgList &Args,
+                const Action::OffloadKind DeviceOffloadingKind) const override;
+
   SanitizerMask getSupportedSanitizers() const override;
 
   VersionTuple
diff --git a/sycl-jit/jit-compiler/include/RTC.h b/sycl-jit/jit-compiler/include/RTC.h
index a1a39d5b50ad3..6a690df459646 100644
--- a/sycl-jit/jit-compiler/include/RTC.h
+++ b/sycl-jit/jit-compiler/include/RTC.h
@@ -176,10 +176,11 @@ class RTCResult {
 
 /// Calculates a BLAKE3 hash of the pre-processed source string described by
 /// \p SourceFile (considering any additional \p IncludeFiles) and the
-/// concatenation of the \p UserArgs.
+/// concatenation of the \p UserArgs, for a given \p Format.
 JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
                                               View<InMemoryFile> IncludeFiles,
-                                              View<const char *> UserArgs);
+                                              View<const char *> UserArgs,
+                                              BinaryFormat Format);
 
 /// Compiles, links against device libraries, and finalizes the device code in
 /// the source string described by \p SourceFile, considering any additional \p
@@ -191,10 +192,14 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 ///
 /// If \p SaveIR is true and \p CachedIR is empty, the LLVM module obtained from
 /// the frontend invocation is wrapped in bitcode format in the result object.
+///
+/// \p BinaryFormat describes the desired format of the compilation - which
+/// corresponds to the backend that is being targeted.
 JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
                                         View<InMemoryFile> IncludeFiles,
                                         View<const char *> UserArgs,
-                                        View<char> CachedIR, bool SaveIR);
+                                        View<char> CachedIR, bool SaveIR,
+                                        BinaryFormat Format);
 
 /// Requests that the JIT binary referenced by \p Address is deleted from the
 /// `JITContext`.
diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
index 20891e7615b5c..902071a1fe2b4 100644
--- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
+++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
@@ -8,12 +8,16 @@
 
 #include "DeviceCompilation.h"
 #include "ESIMD.h"
+#include "JITBinaryInfo.h"
+#include "translation/Translation.h"
 
 #include <clang/Basic/DiagnosticDriver.h>
 #include <clang/Basic/Version.h>
 #include <clang/CodeGen/CodeGenAction.h>
 #include <clang/Driver/Compilation.h>
+#include <clang/Driver/Driver.h>
 #include <clang/Driver/Options.h>
+#include <clang/Driver/ToolChain.h>
 #include <clang/Frontend/ChainedDiagnosticConsumer.h>
 #include <clang/Frontend/CompilerInstance.h>
 #include <clang/Frontend/FrontendActions.h>
@@ -52,6 +56,7 @@ using namespace llvm::opt;
 using namespace llvm::sycl;
 using namespace llvm::module_split;
 using namespace llvm::util;
+using namespace llvm::vfs;
 using namespace jit_compiler;
 
 #ifdef _GNU_SOURCE
@@ -313,7 +318,7 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler {
 } // anonymous namespace
 
 static void adjustArgs(const InputArgList &UserArgList,
-                       const std::string &DPCPPRoot,
+                       const std::string &DPCPPRoot, BinaryFormat Format,
                        SmallVectorImpl<std::string> &CommandLine) {
   DerivedArgList DAL{UserArgList};
   const auto &OptTable = getDriverOptTable();
@@ -326,6 +331,17 @@ static void adjustArgs(const InputArgList &UserArgList,
   // unused argument warning.
   DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments));
 
+  if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) {
+    auto [CPU, Features] =
+        Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format);
+    (void)Features;
+    StringRef OT = Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda"
+                                               : "amdgcn-amd-amdhsa";
+    DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ), OT);
+    DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ), OT);
+    DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU);
+  }
+
   ArgStringList ASL;
   for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); });
   for_each(UserArgList,
@@ -362,10 +378,9 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot,
       });
 }
 
-Expected<std::string>
-jit_compiler::calculateHash(InMemoryFile SourceFile,
-                            View<InMemoryFile> IncludeFiles,
-                            const InputArgList &UserArgList) {
+Expected<std::string> jit_compiler::calculateHash(
+    InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
+    const InputArgList &UserArgList, BinaryFormat Format) {
   TimeTraceScope TTS{"calculateHash"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -374,7 +389,7 @@ jit_compiler::calculateHash(InMemoryFile SourceFile,
   }
 
   SmallVector<std::string> CommandLine;
-  adjustArgs(UserArgList, DPCPPRoot, CommandLine);
+  adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine);
 
   FixedCompilationDatabase DB{".", CommandLine};
   ClangTool Tool{DB, {SourceFile.Path}};
@@ -400,11 +415,10 @@ jit_compiler::calculateHash(InMemoryFile SourceFile,
   return createStringError("Calculating source hash failed");
 }
 
-Expected<ModuleUPtr>
-jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
-                                View<InMemoryFile> IncludeFiles,
-                                const InputArgList &UserArgList,
-                                std::string &BuildLog, LLVMContext &Context) {
+Expected<ModuleUPtr> jit_compiler::compileDeviceCode(
+    InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
+    const InputArgList &UserArgList, std::string &BuildLog,
+    LLVMContext &Context, BinaryFormat Format) {
   TimeTraceScope TTS{"compileDeviceCode"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -413,7 +427,7 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
   }
 
   SmallVector<std::string> CommandLine;
-  adjustArgs(UserArgList, DPCPPRoot, CommandLine);
+  adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine);
 
   FixedCompilationDatabase DB{".", CommandLine};
   ClangTool Tool{DB, {SourceFile.Path}};
@@ -431,12 +445,22 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
   return createStringError(BuildLog);
 }
 
-// This function is a simplified copy of the device library selection process in
-// `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V target
-// (no AoT, no third-party GPUs, no native CPU). Keep in sync!
+// This function is a simplified copy of the device library selection process
+// in `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V, or
+// GPU targets (no AoT, no native CPU). Keep in sync!
 static bool getDeviceLibraries(const ArgList &Args,
                                SmallVectorImpl<std::string> &LibraryList,
-                               DiagnosticsEngine &Diags) {
+                               DiagnosticsEngine &Diags, BinaryFormat Format) {
+  // For CUDA/HIP we only need devicelib, early exit here.
+  if (Format == BinaryFormat::PTX) {
+    LibraryList.push_back(
+        Args.MakeArgString("devicelib-nvptx64-nvidia-cuda.bc"));
+    return false;
+  } else if (Format == BinaryFormat::AMDGCN) {
+    LibraryList.push_back(Args.MakeArgString("devicelib-amdgcn-amd-amdhsa.bc"));
+    return false;
+  }
+
   struct DeviceLibOptInfo {
     StringRef DeviceLibName;
     StringRef DeviceLibOption;
@@ -541,7 +565,8 @@ static Expected<ModuleUPtr> loadBitcodeLibrary(StringRef LibPath,
 
 Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
                                         const InputArgList &UserArgList,
-                                        std::string &BuildLog) {
+                                        std::string &BuildLog,
+                                        BinaryFormat Format) {
   TimeTraceScope TTS{"linkDeviceLibraries"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -556,11 +581,29 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
                           /* ShouldOwnClient=*/false);
 
   SmallVector<std::string> LibNames;
-  bool FoundUnknownLib = getDeviceLibraries(UserArgList, LibNames, Diags);
+  const bool FoundUnknownLib =
+      getDeviceLibraries(UserArgList, LibNames, Diags, Format);
   if (FoundUnknownLib) {
     return createStringError("Could not determine list of device libraries: %s",
                              BuildLog.c_str());
   }
+  const bool IsCudaHIP =
+      Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN;
+  if (IsCudaHIP) {
+    // Based on the OS and the format decide on the version of libspirv.
+    // NOTE: this will be problematic if cross-compiling between OSes.
+    std::string Libclc{"clc/"};
+    Libclc.append(
+#ifdef _WIN32
+        "remangled-l32-signed_char.libspirv-"
+#else
+        "remangled-l64-signed_char.libspirv-"
+#endif
+    );
+    Libclc.append(Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda.bc"
+                                              : "amdgcn-amd-amdhsa.bc");
+    LibNames.push_back(Libclc);
+  }
 
   LLVMContext &Context = Module.getContext();
   for (const std::string &LibName : LibNames) {
@@ -578,6 +621,72 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
     }
   }
 
+  // For GPU targets we need to link against vendor provided libdevice.
+  if (IsCudaHIP) {
+    std::string Argv0 = DPCPPRoot + "/bin/clang++";
+    Triple T{Module.getTargetTriple()};
+    IntrusiveRefCntPtr<OverlayFileSystem> OFS{
+        new OverlayFileSystem{getRealFileSystem()}};
+    IntrusiveRefCntPtr<InMemoryFileSystem> VFS{new InMemoryFileSystem};
+    std::string CppFileName{"a.cpp"};
+    VFS->addFile(CppFileName, /*ModificationTime=*/0,
+                 MemoryBuffer::getMemBuffer("", ""));
+    OFS->pushOverlay(VFS);
+    Driver D{Argv0, T.getTriple(), Diags, "dpcpp compiler driver", OFS};
+
+    SmallVector<std::string> CommandLine;
+    CommandLine.push_back(Argv0);
+    adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine);
+    CommandLine.push_back(CppFileName);
+    SmallVector<const char *> CommandLineCStr(CommandLine.size());
+    llvm::transform(CommandLine, CommandLineCStr.begin(),
+                    [](const auto &S) { return S.c_str(); });
+
+    Compilation *C = D.BuildCompilation(CommandLineCStr);
+    if (!C) {
+      return createStringError("Unable to construct driver for CUDA/HIP");
+    }
+
+    const ToolChain *OffloadTC =
+        C->getSingleOffloadToolChain<Action::OFK_SYCL>();
+    InputArgList EmptyArgList;
+    auto Archs =
+        D.getOffloadArchs(*C, EmptyArgList, Action::OFK_SYCL, OffloadTC);
+    assert(Archs.size() == 1 &&
+           "Offload toolchain should be configured to single architecture");
+    StringRef CPU = *Archs.begin();
+
+    // Pass only `-march=` or `-mcpu=` with the GPU arch determined by the
+    // driver to `getDeviceLibs`.
+    DerivedArgList CPUArgList{EmptyArgList};
+    if (Format == BinaryFormat::PTX) {
+      CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_march_EQ),
+                              CPU);
+    } else {
+      CPUArgList.AddJoinedArg(nullptr, D.getOpts().getOption(OPT_mcpu_EQ), CPU);
+    }
+
+    SmallVector<ToolChain::BitCodeLibraryInfo, 12> CommonDeviceLibs =
+        OffloadTC->getDeviceLibs(CPUArgList, Action::OffloadKind::OFK_SYCL);
+    if (CommonDeviceLibs.empty()) {
+      return createStringError("Unable to find common device libraries");
+    }
+
+    for (auto &Lib : CommonDeviceLibs) {
+      ModuleUPtr LibModule;
+      if (auto Error =
+              loadBitcodeLibrary(Lib.Path, Context).moveInto(LibModule)) {
+        return Error;
+      }
+
+      if (Linker::linkModules(Module, std::move(LibModule),
+                              Linker::LinkOnlyNeeded)) {
+        return createStringError("Unable to link device library %s: %s",
+                                 Lib.Path.c_str(), BuildLog.c_str());
+      }
+    }
+  }
+
   return Error::success();
 }
 
diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
index 62da2cdb54bf1..aa1b19df0cc8f 100644
--- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
+++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "JITBinaryInfo.h"
 #include "RTC.h"
 
 #include <llvm/ADT/SmallVector.h>
@@ -24,16 +25,17 @@ using ModuleUPtr = std::unique_ptr<llvm::Module>;
 
 llvm::Expected<std::string>
 calculateHash(InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
-              const llvm::opt::InputArgList &UserArgList);
+              const llvm::opt::InputArgList &UserArgList, BinaryFormat Format);
 
 llvm::Expected<ModuleUPtr>
 compileDeviceCode(InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
                   const llvm::opt::InputArgList &UserArgList,
-                  std::string &BuildLog, llvm::LLVMContext &Context);
+                  std::string &BuildLog, llvm::LLVMContext &Context,
+                  BinaryFormat Format);
 
 llvm::Error linkDeviceLibraries(llvm::Module &Module,
                                 const llvm::opt::InputArgList &UserArgList,
-                                std::string &BuildLog);
+                                std::string &BuildLog, BinaryFormat Format);
 
 using PostLinkResult = std::pair<RTCBundleInfo, llvm::SmallVector<ModuleUPtr>>;
 llvm::Expected<PostLinkResult>
diff --git a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
index b26d6212fe3e5..d54d2bcd2b863 100644
--- a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
+++ b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RTC.h"
+#include "JITBinaryInfo.h"
 #include "helper/ErrorHelper.h"
 #include "rtc/DeviceCompilation.h"
 #include "translation/SPIRVLLVMTranslation.h"
@@ -26,7 +27,8 @@ using namespace jit_compiler;
 
 JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
                                               View<InMemoryFile> IncludeFiles,
-                                              View<const char *> UserArgs) {
+                                              View<const char *> UserArgs,
+                                              BinaryFormat Format) {
   llvm::opt::InputArgList UserArgList;
   if (auto Error = parseUserArgs(UserArgs).moveInto(UserArgList)) {
     return errorTo<RTCHashResult>(std::move(Error),
@@ -36,8 +38,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 
   auto Start = std::chrono::high_resolution_clock::now();
   std::string Hash;
-  if (auto Error =
-          calculateHash(SourceFile, IncludeFiles, UserArgList).moveInto(Hash)) {
+  if (auto Error = calculateHash(SourceFile, IncludeFiles, UserArgList, Format)
+                       .moveInto(Hash)) {
     return errorTo<RTCHashResult>(std::move(Error), "Hashing failed",
                                   /*IsHash=*/false);
   }
@@ -55,7 +57,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
                                         View<InMemoryFile> IncludeFiles,
                                         View<const char *> UserArgs,
-                                        View<char> CachedIR, bool SaveIR) {
+                                        View<char> CachedIR, bool SaveIR,
+                                        BinaryFormat Format) {
   llvm::LLVMContext Context;
   std::string BuildLog;
   configureDiagnostics(Context, BuildLog);
@@ -104,7 +107,7 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
   bool FromSource = !Module;
   if (FromSource) {
     if (auto Error = compileDeviceCode(SourceFile, IncludeFiles, UserArgList,
-                                       BuildLog, Context)
+                                       BuildLog, Context, Format)
                          .moveInto(Module)) {
       return errorTo<RTCResult>(std::move(Error), "Device compilation failed");
     }
@@ -118,7 +121,8 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
     IR = RTCDeviceCodeIR{BCString.data(), BCString.data() + BCString.size()};
   }
 
-  if (auto Error = linkDeviceLibraries(*Module, UserArgList, BuildLog)) {
+  if (auto Error =
+          linkDeviceLibraries(*Module, UserArgList, BuildLog, Format)) {
     return errorTo<RTCResult>(std::move(Error), "Device linking failed");
   }
 
@@ -131,9 +135,9 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
 
   for (auto [DevImgInfo, Module] :
        llvm::zip_equal(BundleInfo.DevImgInfos, Modules)) {
-    if (auto Error = Translator::translate(*Module, JITContext::getInstance(),
-                                           BinaryFormat::SPIRV)
-                         .moveInto(DevImgInfo.BinaryInfo)) {
+    if (auto Error =
+            Translator::translate(*Module, JITContext::getInstance(), Format)
+                .moveInto(DevImgInfo.BinaryInfo)) {
       return errorTo<RTCResult>(std::move(Error), "SPIR-V translation failed");
     }
   }
diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp
index 457b6bfc0e637..4acd6c81c7129 100644
--- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp
+++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp
@@ -83,9 +83,6 @@ llvm::Expected<JITBinary *> Translator::translateToPTX(llvm::Module &Mod,
   LLVMInitializeNVPTXAsmPrinter();
   LLVMInitializeNVPTXTargetMC();
 
-  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
-  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
-
   std::string TargetTriple{"nvptx64-nvidia-cuda"};
 
   std::string ErrorMessage;
@@ -99,32 +96,11 @@ llvm::Expected<JITBinary *> Translator::translateToPTX(llvm::Module &Mod,
         ErrorMessage.c_str());
   }
 
-  // Give priority to user specified values (through environment variables:
-  // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES).
-  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
-  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
-  llvm::StringRef CPU = CPUVal.begin();
-  llvm::StringRef Features = FeaturesVal.begin();
-
-  auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr;
-  // If they were not set, use default and consult the module for alternatives
-  // (if present).
-  if (CPU.empty()) {
-    CPU = "sm_50";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
-    }
-  }
-  if (Features.empty()) {
-    Features = "+sm_50,+ptx76";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                     .getValueAsString();
-    }
-  }
+  auto [CPU, Features] =
+      getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::PTX);
 
   std::unique_ptr<TargetMachine> TargetMachine(Target->createTargetMachine(
-      Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
       llvm::CodeGenOptLevel::Default));
 
   llvm::legacy::PassManager PM;
@@ -166,9 +142,6 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
   LLVMInitializeAMDGPUAsmPrinter();
   LLVMInitializeAMDGPUTargetMC();
 
-  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
-  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
-
   std::string TargetTriple{"amdgcn-amd-amdhsa"};
 
   std::string ErrorMessage;
@@ -181,29 +154,10 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
         "Failed to load and translate AMDGCN LLVM IR module with error %s",
         ErrorMessage.c_str());
 
-  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
-  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
-  llvm::StringRef CPU = CPUVal.begin();
-  llvm::StringRef Features = FeaturesVal.begin();
-
-  auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr;
-  if (CPU.empty()) {
-    // Set to the lowest tested target according to the GetStartedGuide, section
-    // "Build DPC++ toolchain with support for HIP AMD"
-    CPU = "gfx906";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
-    }
-  }
-  if (Features.empty()) {
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                     .getValueAsString();
-    }
-  }
-
+  auto [CPU, Features] =
+      getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::AMDGCN);
   std::unique_ptr<TargetMachine> TargetMachine(Target->createTargetMachine(
-      Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
       llvm::CodeGenOptLevel::Default));
 
   std::string AMDObj;
@@ -226,3 +180,34 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
   return &JITCtx.emplaceBinary(std::move(AMDObj), BinaryFormat::AMDGCN);
 #endif // JIT_SUPPORT_AMDGCN
 }
+
+std::pair<std::string, std::string> Translator::getTargetCPUAndFeatureAttrs(
+    const llvm::Module *M, const char *KernelName, BinaryFormat Format) {
+  assert((Format == BinaryFormat::AMDGCN || Format == BinaryFormat::PTX) &&
+         "Unexpected format found");
+  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
+  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
+  // Give priority to user specified values (through environment variables:
+  // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES).
+  auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr;
+  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
+  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
+  llvm::StringRef CPU{CPUVal.begin(), CPUVal.size()};
+  llvm::StringRef Features{FeaturesVal.begin(), FeaturesVal.size()};
+  if (CPU.empty()) {
+    // Set to the lowest tested target according to the GetStartedGuide, section
+    // "Build DPC++ toolchain with support for HIP AMD"
+    CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+    }
+  }
+  if (Features.empty()) {
+    Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : "";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                     .getValueAsString();
+    }
+  }
+  return std::make_pair(std::string{CPU}, std::string{Features});
+}
diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.h b/sycl-jit/jit-compiler/lib/translation/Translation.h
index 9c8dfdda5f9f5..78bf4ef8ac416 100644
--- a/sycl-jit/jit-compiler/lib/translation/Translation.h
+++ b/sycl-jit/jit-compiler/lib/translation/Translation.h
@@ -12,7 +12,6 @@
 #include "JITContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Error.h"
-#include <vector>
 
 namespace jit_compiler {
 
@@ -26,6 +25,9 @@ class Translator {
   translate(llvm::Module &Mod, JITContext &JITCtx, BinaryFormat Format,
             const char *KernelName = nullptr);
 
+  std::pair<std::string, std::string> static getTargetCPUAndFeatureAttrs(
+      const llvm::Module *M, const char *KernelName, BinaryFormat Format);
+
 private:
   /// Pair of address and size to represent a binary blob.
   using BinaryBlob = std::pair<BinaryAddress, size_t>;
diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp
index b4956a82ce305..0cbc009cc88c7 100644
--- a/sycl/source/detail/device_image_impl.hpp
+++ b/sycl/source/detail/device_image_impl.hpp
@@ -8,6 +8,9 @@
 
 #pragma once
 
+#if SYCL_EXT_JIT_ENABLE
+#include "JITBinaryInfo.h"
+#endif // SYCL_EXT_JIT_ENABLE
 #include <detail/adapter_impl.hpp>
 #include <detail/compiler.hpp>
 #include <detail/context_impl.hpp>
@@ -702,6 +705,28 @@ class device_image_impl
     return MRTCBinInfo && MRTCBinInfo->MLanguage == Lang;
   }
 
+  static ::jit_compiler::BinaryFormat
+  getTargetFormat([[maybe_unused]] const backend Backend) {
+#if SYCL_EXT_JIT_ENABLE
+    switch (Backend) {
+    case backend::ext_oneapi_level_zero:
+    case backend::opencl:
+      return ::jit_compiler::BinaryFormat::SPIRV;
+    case backend::ext_oneapi_cuda:
+      return ::jit_compiler::BinaryFormat::PTX;
+    case backend::ext_oneapi_hip:
+      return ::jit_compiler::BinaryFormat::AMDGCN;
+    default:
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::invalid),
+          "Backend does not support kernel_compiler extension");
+    }
+#else
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "JIT not supported");
+#endif // SYCL_EXT_JIT_ENABLE
+  }
+
   std::vector<std::shared_ptr<device_image_impl>> buildFromSource(
       const std::vector<device> &Devices,
       const std::vector<sycl::detail::string_view> &BuildOptions,
@@ -732,9 +757,12 @@ class device_image_impl
       }
     }
 
-    if (MRTCBinInfo->MLanguage == syclex::source_language::sycl)
+    if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) {
+      const auto Format = getTargetFormat(MContext.get_backend());
       return createSYCLImages(Devices, bundle_state::executable, BuildOptions,
-                              LogPtr, RegisteredKernelNames, OutDeviceBins);
+                              LogPtr, RegisteredKernelNames, OutDeviceBins,
+                              Format);
+    }
 
     std::vector<ur_device_handle_t> DeviceVec;
     DeviceVec.reserve(Devices.size());
@@ -822,8 +850,10 @@ class device_image_impl
                               "device does not support source language");
       }
     }
+    const auto Format = getTargetFormat(MContext.get_backend());
     return createSYCLImages(Devices, bundle_state::object, CompileOptions,
-                            LogPtr, RegisteredKernelNames, OutDeviceBins);
+                            LogPtr, RegisteredKernelNames, OutDeviceBins,
+                            Format);
   }
 
 private:
@@ -995,8 +1025,8 @@ class device_image_impl
       const std::vector<sycl::detail::string_view> &Options,
       std::string *LogPtr,
       const std::vector<sycl::detail::string_view> &RegisteredKernelNames,
-      std::vector<std::shared_ptr<ManagedDeviceBinaries>> &OutDeviceBins)
-      const {
+      std::vector<std::shared_ptr<ManagedDeviceBinaries>> &OutDeviceBins,
+      ::jit_compiler::BinaryFormat Format) const {
     assert(MRTCBinInfo);
     assert(MRTCBinInfo->MLanguage == syclex::source_language::sycl);
     assert(std::holds_alternative<std::string>(MBinImage));
@@ -1026,7 +1056,7 @@ class device_image_impl
 
     auto [Binaries, Prefix] = syclex::detail::SYCL_JIT_Compile(
         RegisteredKernelNames.empty() ? SourceStr : SourceExt.str(),
-        MRTCBinInfo->MIncludePairs, Options, LogPtr);
+        MRTCBinInfo->MIncludePairs, Options, LogPtr, Format);
 
     auto &PM = detail::ProgramManager::getInstance();
 
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 189bd3d145309..2884e25399e5f 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -263,6 +263,7 @@ sycl_device_binaries jit_compiler::createDeviceBinaries(
       Binary.addOffloadEntry(std::move(Entry));
     }
 
+    bool FinalizationTagAdded = false;
     for (const auto &FPS : DevImgInfo.Properties) {
       bool IsDeviceGlobalsPropSet =
           FPS.Name == __SYCL_PROPERTY_SET_SYCL_DEVICE_GLOBALS;
@@ -279,18 +280,50 @@ sycl_device_binaries jit_compiler::createDeviceBinaries(
               sycl_property_type::SYCL_PROPERTY_TYPE_BYTE_ARRAY});
         }
       }
+      if (FPS.Name == __SYCL_PROPERTY_SET_PROGRAM_METADATA &&
+          DevImgInfo.BinaryInfo.Format ==
+              ::jit_compiler::BinaryFormat::AMDGCN) {
+        PropSet.addProperty(PropertyContainer{
+            __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1});
+        FinalizationTagAdded = true;
+      }
       Binary.addProperty(std::move(PropSet));
 
       Binary.setCompileOptions(BundleInfo.CompileOptions.c_str());
     }
 
-    Collection->addDeviceBinary(std::move(Binary),
-                                DevImgInfo.BinaryInfo.BinaryStart,
-                                DevImgInfo.BinaryInfo.BinarySize,
-                                (DevImgInfo.BinaryInfo.AddressBits == 64)
-                                    ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64
-                                    : __SYCL_DEVICE_BINARY_TARGET_SPIRV32,
-                                SYCL_DEVICE_BINARY_TYPE_SPIRV);
+    auto BinaryTarget = "";
+    auto Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+    switch (DevImgInfo.BinaryInfo.Format) {
+    case ::jit_compiler::BinaryFormat::SPIRV:
+      BinaryTarget = DevImgInfo.BinaryInfo.AddressBits == 64
+                         ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64
+                         : __SYCL_DEVICE_BINARY_TARGET_SPIRV32;
+      Format = SYCL_DEVICE_BINARY_TYPE_SPIRV;
+      break;
+    case ::jit_compiler::BinaryFormat::PTX:
+      BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_NVPTX64;
+      Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+      break;
+    case ::jit_compiler::BinaryFormat::AMDGCN: {
+      BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_AMDGCN;
+      Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+      // If the program had no properties, the tag needs to be added now.
+      if (!FinalizationTagAdded) {
+        PropertySetContainer ProgramMetadata{
+            __SYCL_PROPERTY_SET_PROGRAM_METADATA};
+        ProgramMetadata.addProperty(PropertyContainer{
+            __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1});
+        Binary.addProperty(std::move(ProgramMetadata));
+      }
+      break;
+    }
+    default:
+      assert(false && "Unsupported format");
+    };
+    Collection->addDeviceBinary(
+        std::move(Binary), DevImgInfo.BinaryInfo.BinaryStart,
+        DevImgInfo.BinaryInfo.BinarySize, BinaryTarget, Format);
   }
 
   sycl_device_binaries Binaries = Collection->getPIDeviceStruct();
@@ -311,7 +344,26 @@ void jit_compiler::destroyDeviceBinaries(sycl_device_binaries Binaries) {
 std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
     const std::string &CompilationID, const std::string &SYCLSource,
     const std::vector<std::pair<std::string, std::string>> &IncludePairs,
-    const std::vector<std::string> &UserArgs, std::string *LogPtr) {
+    const std::vector<std::string> &UserArgs, std::string *LogPtr,
+    ::jit_compiler::BinaryFormat Format) {
+  if (Format == ::jit_compiler::BinaryFormat::PTX ||
+      Format == ::jit_compiler::BinaryFormat::AMDGCN) {
+    // If present, set-up the config with env variables describing CPU and
+    // features.
+    auto SetUpOption = [](const std::string &Value) {
+      ::jit_compiler::JITEnvVar Option(Value.begin(), Value.end());
+      return Option;
+    };
+    ::jit_compiler::JITEnvVar TargetCPUOpt = SetUpOption(
+        detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_CPU>::get());
+    this->AddToConfigHandle(
+        ::jit_compiler::option::JITTargetCPU::set(TargetCPUOpt));
+    ::jit_compiler::JITEnvVar TargetFeaturesOpt = SetUpOption(
+        detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES>::get());
+    this->AddToConfigHandle(
+        ::jit_compiler::option::JITTargetFeatures::set(TargetFeaturesOpt));
+  }
+
   auto appendToLog = [LogPtr](const char *Msg) {
     if (LogPtr) {
       LogPtr->append(Msg);
@@ -339,7 +391,7 @@ std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
   std::vector<char> CachedIR;
   if (PersistentDeviceCodeCache::isEnabled()) {
     auto Result =
-        CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView);
+        CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView, Format);
 
     if (Result.failed()) {
       appendToLog(Result.getPreprocLog());
@@ -349,8 +401,9 @@ std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
     }
   }
 
-  auto Result = CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView,
-                                  CachedIR, /*SaveIR=*/!CacheKey.empty());
+  auto Result =
+      CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, CachedIR,
+                        /*SaveIR=*/!CacheKey.empty(), Format);
 
   const char *BuildLog = Result.getBuildLog();
   appendToLog(BuildLog);
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 31f87cd2be7e0..ac3ebf7d73fa4 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -47,7 +47,8 @@ class jit_compiler {
   std::pair<sycl_device_binaries, std::string> compileSYCL(
       const std::string &CompilationID, const std::string &SYCLSource,
       const std::vector<std::pair<std::string, std::string>> &IncludePairs,
-      const std::vector<std::string> &UserArgs, std::string *LogPtr);
+      const std::vector<std::string> &UserArgs, std::string *LogPtr,
+      ::jit_compiler::BinaryFormat Format);
 
   void destroyDeviceBinaries(sycl_device_binaries Binaries);
 
diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
index 93ec94a8ac328..418842fdc48ec 100644
--- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
+++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
@@ -44,7 +44,8 @@ std::pair<sycl_device_binaries, std::string> SYCL_JIT_Compile(
     [[maybe_unused]] const std::string &SYCLSource,
     [[maybe_unused]] const include_pairs_t &IncludePairs,
     [[maybe_unused]] const std::vector<sycl::detail::string_view> &UserArgs,
-    [[maybe_unused]] std::string *LogPtr) {
+    [[maybe_unused]] std::string *LogPtr,
+    [[maybe_unused]] ::jit_compiler::BinaryFormat Format) {
 #if SYCL_EXT_JIT_ENABLE
   static std::atomic_uintptr_t CompilationCounter;
   std::string CompilationID = "rtc_" + std::to_string(CompilationCounter++);
@@ -52,7 +53,7 @@ std::pair<sycl_device_binaries, std::string> SYCL_JIT_Compile(
   for (const sycl::detail::string_view UserArg : UserArgs)
     UserArgStrings.push_back(UserArg.data());
   return sycl::detail::jit_compiler::get_instance().compileSYCL(
-      CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr);
+      CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr, Format);
 #else
   throw sycl::exception(sycl::errc::build,
                         "kernel_compiler via sycl-jit is not available");
diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
index c088122a33ff5..0dfaf63a1d8da 100644
--- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
+++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
@@ -8,6 +8,9 @@
 
 #pragma once
 
+#if SYCL_EXT_JIT_ENABLE
+#include "JITBinaryInfo.h"
+#endif // SYCL_EXT_JIT_ENABLE
 #include <sycl/detail/defines_elementary.hpp>
 #include <sycl/detail/export.hpp> // __SYCL_EXPORT
 #include <sycl/detail/string_view.hpp>
@@ -39,7 +42,7 @@ userArgsAsString(const std::vector<sycl::detail::string_view> &UserArguments);
 std::pair<sycl_device_binaries, std::string>
 SYCL_JIT_Compile(const std::string &Source, const include_pairs_t &IncludePairs,
                  const std::vector<sycl::detail::string_view> &UserArgs,
-                 std::string *LogPtr);
+                 std::string *LogPtr, ::jit_compiler::BinaryFormat Format);
 
 void SYCL_JIT_Destroy(sycl_device_binaries Binaries);
 
diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp
index fcc380c9a537b..c18bdbc46a62c 100644
--- a/sycl/source/kernel_bundle.cpp
+++ b/sycl/source/kernel_bundle.cpp
@@ -409,8 +409,10 @@ bool is_source_kernel_bundle_supported(
     const std::vector<device_impl *> &DeviceImplVec) {
   backend BE = DeviceImplVec[0]->getBackend();
   // Support is limited to the opencl and level_zero backends.
-  bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) ||
-                       (BE == sycl::backend::opencl);
+  bool BE_Acceptable = BE == sycl::backend::ext_oneapi_level_zero ||
+                       BE == sycl::backend::opencl ||
+                       BE == sycl::backend::ext_oneapi_hip ||
+                       BE == sycl::backend::ext_oneapi_cuda;
   if (!BE_Acceptable)
     return false;
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp
index 3095d570be946..13003bea98a62 100644
--- a/sycl/test-e2e/KernelCompiler/sycl.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 // XFAIL: preview-mode && run-mode
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
@@ -108,15 +107,14 @@ auto constexpr DeviceLibrariesSource = R"===(
 #include <sycl/sycl.hpp>
 #include <cmath>
 #include <complex>
-#include <sycl/ext/intel/math.hpp>
 
 extern "C" SYCL_EXTERNAL 
 SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel)
 void device_libs_kernel(float *ptr) {
   // Extension list: llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp
 
-  // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing it.
-  // Only test the fp32 variants of complex, math and imf to keep this test
+  // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing
+  // it. Only test the fp32 variants of complex and math to keep this test
   // device-agnostic.
   
   // cl_intel_devicelib_math
@@ -128,14 +126,8 @@ void device_libs_kernel(float *ptr) {
   // cl_intel_devicelib_cstring
   ptr[2] = memcmp(ptr + 2, ptr + 2, sizeof(float));
 
-  // cl_intel_devicelib_imf
-  ptr[3] = sycl::ext::intel::math::sqrt(ptr[3] * 2);
-
-  // cl_intel_devicelib_imf_bf16
-  ptr[4] = sycl::ext::intel::math::float2bfloat16(ptr[4] * 0.5f);
-
   // cl_intel_devicelib_bfloat16
-  ptr[5] = sycl::ext::oneapi::bfloat16{ptr[5] / 0.25f};
+  ptr[3] = sycl::ext::oneapi::bfloat16{ptr[3] / 0.25f};
 }
 )===";
 
@@ -356,7 +348,7 @@ int test_device_libraries(sycl::queue q) {
   exe_kb kbExe = syclex::build(kbSrc);
 
   sycl::kernel k = kbExe.ext_oneapi_get_kernel("device_libs_kernel");
-  constexpr size_t nElem = 6;
+  constexpr size_t nElem = 4;
   float *ptr = sycl::malloc_shared<float>(nElem, q);
   for (int i = 0; i < nElem; ++i)
     ptr[i] = 1.0f;
diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
index 5832d5ff50685..fbb7768fd8ac8 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
index 405d97ce15e96..0cffec11ffa47 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
@@ -7,6 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: (opencl || level_zero)
+// Unlike other RTC tests, don't run this one on Cuda/HIP. Eviction mechanism
+// is based on the size of compiled kernels, which in turns depends on the
+// target. Don't run eviction check for CUDA/HIP, so that we don't have to find
+// a magic number that works for all binaries (and by definition is flaky).
+
 // REQUIRES: aspect-usm_device_allocations
 
 // DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=7 SYCL_CACHE_DIR=%t/cache_dir
diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
index e72c733fdbf9f..7d41f3a3f79d0 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
@@ -6,10 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
-
 // RUN: %{build} -o %t.out
-// RUN: %{run-unfiltered-devices} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run-unfiltered-devices} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
index 2296aba1d9504..ba8b08584df81 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: opencl && gpu
 // UNSUPPORTED-TRACKER: GSD-4287
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
index 410f4f3e8d753..4ca1f5296dc7b 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
@@ -8,6 +8,8 @@
 
 // REQUIRES: (opencl || level_zero)
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for a case where a kernel bundle is built that exports a symbol and
 // -- other kernel bundles that uses it are compiled/linked without it. These
 // -- cases should fail due to unresolved symbols, rather than picking up the
diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp
new file mode 100644
index 0000000000000..142cc2160ac52
--- /dev/null
+++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp
@@ -0,0 +1,79 @@
+//==--- sycl_imf.cpp --- kernel_compiler extension imf tests ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: aspect-usm_device_allocations
+// REQUIRES: (opencl || level_zero)
+
+// UNSUPPORTED: accelerator
+// UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
+
+// RUN: %{build} -o %t.out
+// RUN: %{l0_leak_check} %{run} %t.out
+
+// XFAIL: preview-mode && run-mode
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
+
+#include <sycl/detail/core.hpp>
+#include <sycl/kernel_bundle.hpp>
+#include <sycl/usm.hpp>
+
+auto constexpr IMFSources = R"===(
+#include <sycl/sycl.hpp>
+#include <cmath>
+#include <sycl/ext/intel/math.hpp>
+
+extern "C" SYCL_EXTERNAL 
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel)
+void imf_kernel(float *ptr) {
+  // cl_intel_devicelib_imf
+  ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2);
+
+  // cl_intel_devicelib_imf_bf16
+  ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f);
+}
+)===";
+
+int main() {
+#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER
+  sycl::queue q;
+  namespace syclex = sycl::ext::oneapi::experimental;
+  using source_kb = sycl::kernel_bundle<sycl::bundle_state::ext_oneapi_source>;
+  using exe_kb = sycl::kernel_bundle<sycl::bundle_state::executable>;
+
+  sycl::context ctx = q.get_context();
+
+  source_kb kbSrc = syclex::create_kernel_bundle_from_source(
+      ctx, syclex::source_language::sycl, IMFSources);
+  exe_kb kbExe = syclex::build(kbSrc);
+
+  sycl::kernel k = kbExe.ext_oneapi_get_kernel("imf_kernel");
+  constexpr size_t nElem = 2;
+  float *ptr = sycl::malloc_shared<float>(nElem, q);
+  for (int i = 0; i < nElem; ++i)
+    ptr[i] = 1.0f;
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.set_arg(0, ptr);
+    cgh.single_task(k);
+  });
+  q.wait_and_throw();
+
+  // Check that the kernel was executed. Given the {1.0, 1.0} input,
+  // the expected result is approximately {1.41, 0.5}.
+  for (unsigned i = 0; i < nElem; ++i) {
+    std::cout << ptr[i] << ' ';
+    assert(ptr[i] != 1.0f);
+  }
+  std::cout << std::endl;
+
+  sycl::free(ptr, q);
+#else
+  static_assert(false, "Kernel Compiler feature test macro undefined");
+#endif
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
index 0cc50413dbad4..d073390a3d6a9 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
@@ -6,15 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD
 
 // COM: Run test again in a directory that contains a different version of
 //      `header1.hpp`
-// RUN: cd %S/include/C ; %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD
+// RUN: cd %S/include/C ; %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp
index 33cf81dd6bdcb..1cc0c8e8d2d8e 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 // XFAIL: preview-mode && run-mode
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
index 94864926af4c4..7d38e44cab6c2 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
@@ -48,14 +47,15 @@ int test_lifetimes() {
       ctx, syclex::source_language::sycl, SYCLSource);
 
   exe_kb kbExe1 = syclex::build(kbSrc);
-  // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG1:.*]]))
+  // Cuda/Hip programs will be created with Binary, spirv IL.
+  // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]]))
 
   {
     std::cout << "Scope1\n";
     // CHECK: Scope1
     exe_kb kbExe2 = syclex::build(kbSrc);
     // kbExe2 goes out of scope; its kernels are removed from program mananager.
-    // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG2:.*]]))
+    // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG2:.*]]))
     // CHECK: urProgramRelease{{.*}}[[PROG2]]
   }
   std::cout << "End Scope1\n";
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link.cpp b/sycl/test-e2e/KernelCompiler/sycl_link.cpp
index 7187488d0f0dc..1453e02b773dc 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link.cpp
@@ -12,6 +12,8 @@
 // -- Test for a simple linking case with source files compiled from SYCL source
 // -- at runtime.
 
+// Note linking is not supported on CUDA/HIP.
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %{l0_leak_check} %{run} %t.out
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
index b1d66710c2fa2..f639a9bb3a337 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
@@ -9,6 +9,8 @@
 // REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for linking where two kernels use the same imported symbols.
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
index 4976a9066bc3e..5183a927881e1 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
@@ -9,6 +9,8 @@
 // REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for linking where two kernels use the same imported symbols, but one
 // -- may not be supported on the device.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
index 4f81eeb684fe4..0739a5d9da00a 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
@@ -9,6 +9,8 @@
 // REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for a case where a kernel bundle with an exported symbol is compiled
 // -- before another kernel bundle using a different variant of the symbol.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
index 41b8c99c137a3..0636fbff37bf7 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
@@ -9,6 +9,8 @@
 // REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for the linking of two kernels with conflicting definitions of
 // -- kernels with the same name.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
index 89ad5f5e67c40..29c54161506f9 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
index 2e7b5fee432a5..a1b6cb787d030 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
@@ -6,11 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
index d3916dfc3803b..4b1a04d1bcb8d 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
@@ -5,9 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: (opencl || level_zero)
+
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out | FileCheck %s
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out | FileCheck %s
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py
index 44da0ef1c52c4..ff2f8e36ecfb7 100644
--- a/sycl/test-e2e/format.py
+++ b/sycl/test-e2e/format.py
@@ -244,6 +244,7 @@ def execute(self, test, litConfig):
             )
             sycl_target_opts += hip_arch_opts
             substitutions.append(("%{hip_arch_opts}", hip_arch_opts))
+            substitutions.append(("%{amd_arch}", test.config.amd_arch))
         if (
             "target-spir" in build_targets
             and "spirv-backend" in test.config.available_features
diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
index 4292342068187..7f3757fc70624 100644
--- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
+++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
@@ -6,7 +6,7 @@
 // CHECK-DAG: README.md
 // CHECK-DAG: lit.cfg.py
 //
-// CHECK-NUM-MATCHES: 28
+// CHECK-NUM-MATCHES: 29
 //
 // This test verifies that `<sycl/sycl.hpp>` isn't used in E2E tests. Instead,
 // fine-grained includes should used, see