diff --git a/sycl-jit/jit-compiler/CMakeLists.txt b/sycl-jit/jit-compiler/CMakeLists.txt
index fabf64546e9da..9e97d2c8ccfdb 100644
--- a/sycl-jit/jit-compiler/CMakeLists.txt
+++ b/sycl-jit/jit-compiler/CMakeLists.txt
@@ -60,6 +60,7 @@ target_include_directories(sycl-jit
   ${LLVM_MAIN_INCLUDE_DIR}
   ${LLVM_SPIRV_INCLUDE_DIRS}
   ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/include
+  ${LLVM_EXTERNAL_CLANG_SOURCE_DIR}/lib
   ${CMAKE_BINARY_DIR}/tools/clang/include
 )
 target_include_directories(sycl-jit
diff --git a/sycl-jit/jit-compiler/include/RTC.h b/sycl-jit/jit-compiler/include/RTC.h
index a1a39d5b50ad3..6a690df459646 100644
--- a/sycl-jit/jit-compiler/include/RTC.h
+++ b/sycl-jit/jit-compiler/include/RTC.h
@@ -176,10 +176,11 @@ class RTCResult {
 
 /// Calculates a BLAKE3 hash of the pre-processed source string described by
 /// \p SourceFile (considering any additional \p IncludeFiles) and the
-/// concatenation of the \p UserArgs.
+/// concatenation of the \p UserArgs, for a given \p Format.
 JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
                                               View<InMemoryFile> IncludeFiles,
-                                              View<const char *> UserArgs);
+                                              View<const char *> UserArgs,
+                                              BinaryFormat Format);
 
 /// Compiles, links against device libraries, and finalizes the device code in
 /// the source string described by \p SourceFile, considering any additional \p
@@ -191,10 +192,14 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 ///
 /// If \p SaveIR is true and \p CachedIR is empty, the LLVM module obtained from
 /// the frontend invocation is wrapped in bitcode format in the result object.
+///
+/// \p BinaryFormat describes the desired format of the compilation - which
+/// corresponds to the backend that is being targeted.
 JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
                                         View<InMemoryFile> IncludeFiles,
                                         View<const char *> UserArgs,
-                                        View<char> CachedIR, bool SaveIR);
+                                        View<char> CachedIR, bool SaveIR,
+                                        BinaryFormat Format);
 
 /// Requests that the JIT binary referenced by \p Address is deleted from the
 /// `JITContext`.
diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
index bc5d638a25255..ade0161991b2d 100644
--- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
+++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.cpp
@@ -8,11 +8,17 @@
 
 #include "DeviceCompilation.h"
 #include "ESIMD.h"
+#include "JITBinaryInfo.h"
+#include "translation/Translation.h"
 
+#include <Driver/ToolChains/AMDGPU.h>
+#include <Driver/ToolChains/Cuda.h>
+#include <Driver/ToolChains/LazyDetector.h>
 #include <clang/Basic/DiagnosticDriver.h>
 #include <clang/Basic/Version.h>
 #include <clang/CodeGen/CodeGenAction.h>
 #include <clang/Driver/Compilation.h>
+#include <clang/Driver/Driver.h>
 #include <clang/Driver/Options.h>
 #include <clang/Frontend/ChainedDiagnosticConsumer.h>
 #include <clang/Frontend/CompilerInstance.h>
@@ -178,7 +184,8 @@ class RTCToolActionBase : public ToolAction {
     assert(!hasExecuted() && "Action should only be invoked on a single file");
 
     // Create a compiler instance to handle the actual work.
-    CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps));
+    CompilerInstance Compiler(std::move(Invocation),
+                              std::move(PCHContainerOps));
     Compiler.setFileManager(Files);
     // Suppress summary with number of warnings and errors being printed to
     // stdout.
@@ -312,7 +319,7 @@ class LLVMDiagnosticWrapper : public llvm::DiagnosticHandler {
 } // anonymous namespace
 
 static void adjustArgs(const InputArgList &UserArgList,
-                       const std::string &DPCPPRoot,
+                       const std::string &DPCPPRoot, BinaryFormat Format,
                        SmallVectorImpl<std::string> &CommandLine) {
   DerivedArgList DAL{UserArgList};
   const auto &OptTable = getDriverOptTable();
@@ -325,6 +332,23 @@ static void adjustArgs(const InputArgList &UserArgList,
   // unused argument warning.
   DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Qunused_arguments));
 
+  if (Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN) {
+    auto [CPU, Features] =
+        Translator::getTargetCPUAndFeatureAttrs(nullptr, "", Format);
+    (void)Features;
+    if (Format == BinaryFormat::AMDGCN) {
+      DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ),
+                       "amdgcn-amd-amdhsa");
+      DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_Xsycl_backend_EQ),
+                       "amdgcn-amd-amdhsa");
+      DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_offload_arch_EQ), CPU);
+    } else {
+      DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_fsycl_targets_EQ),
+                       "nvptx64-nvidia-cuda");
+      DAL.AddFlagArg(nullptr, OptTable.getOption(OPT_Xsycl_backend));
+      DAL.AddJoinedArg(nullptr, OptTable.getOption(OPT_cuda_gpu_arch_EQ), CPU);
+    }
+  }
   ArgStringList ASL;
   for_each(DAL, [&DAL, &ASL](Arg *A) { A->render(DAL, ASL); });
   for_each(UserArgList,
@@ -361,10 +385,9 @@ static void setupTool(ClangTool &Tool, const std::string &DPCPPRoot,
       });
 }
 
-Expected<std::string>
-jit_compiler::calculateHash(InMemoryFile SourceFile,
-                            View<InMemoryFile> IncludeFiles,
-                            const InputArgList &UserArgList) {
+Expected<std::string> jit_compiler::calculateHash(
+    InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
+    const InputArgList &UserArgList, BinaryFormat Format) {
   TimeTraceScope TTS{"calculateHash"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -373,7 +396,7 @@ jit_compiler::calculateHash(InMemoryFile SourceFile,
   }
 
   SmallVector<std::string> CommandLine;
-  adjustArgs(UserArgList, DPCPPRoot, CommandLine);
+  adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine);
 
   FixedCompilationDatabase DB{".", CommandLine};
   ClangTool Tool{DB, {SourceFile.Path}};
@@ -399,11 +422,10 @@ jit_compiler::calculateHash(InMemoryFile SourceFile,
   return createStringError("Calculating source hash failed");
 }
 
-Expected<ModuleUPtr>
-jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
-                                View<InMemoryFile> IncludeFiles,
-                                const InputArgList &UserArgList,
-                                std::string &BuildLog, LLVMContext &Context) {
+Expected<ModuleUPtr> jit_compiler::compileDeviceCode(
+    InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
+    const InputArgList &UserArgList, std::string &BuildLog,
+    LLVMContext &Context, BinaryFormat Format) {
   TimeTraceScope TTS{"compileDeviceCode"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -412,7 +434,7 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
   }
 
   SmallVector<std::string> CommandLine;
-  adjustArgs(UserArgList, DPCPPRoot, CommandLine);
+  adjustArgs(UserArgList, DPCPPRoot, Format, CommandLine);
 
   FixedCompilationDatabase DB{".", CommandLine};
   ClangTool Tool{DB, {SourceFile.Path}};
@@ -430,12 +452,22 @@ jit_compiler::compileDeviceCode(InMemoryFile SourceFile,
   return createStringError(BuildLog);
 }
 
-// This function is a simplified copy of the device library selection process in
-// `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V target
-// (no AoT, no third-party GPUs, no native CPU). Keep in sync!
+// This function is a simplified copy of the device library selection process
+// in `clang::driver::tools::SYCL::getDeviceLibraries`, assuming a SPIR-V, or
+// GPU targets (no AoT, no native CPU). Keep in sync!
 static bool getDeviceLibraries(const ArgList &Args,
                                SmallVectorImpl<std::string> &LibraryList,
-                               DiagnosticsEngine &Diags) {
+                               DiagnosticsEngine &Diags, BinaryFormat Format) {
+  // For CUDA/HIP we only need devicelib, early exit here.
+  if (Format == BinaryFormat::PTX) {
+    LibraryList.push_back(
+        Args.MakeArgString("devicelib-nvptx64-nvidia-cuda.bc"));
+    return false;
+  } else if (Format == BinaryFormat::AMDGCN) {
+    LibraryList.push_back(Args.MakeArgString("devicelib-amdgcn-amd-amdhsa.bc"));
+    return false;
+  }
+
   struct DeviceLibOptInfo {
     StringRef DeviceLibName;
     StringRef DeviceLibOption;
@@ -540,7 +572,8 @@ static Expected<ModuleUPtr> loadBitcodeLibrary(StringRef LibPath,
 
 Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
                                         const InputArgList &UserArgList,
-                                        std::string &BuildLog) {
+                                        std::string &BuildLog,
+                                        BinaryFormat Format) {
   TimeTraceScope TTS{"linkDeviceLibraries"};
 
   const std::string &DPCPPRoot = getDPCPPRoot();
@@ -555,11 +588,29 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
                           /* ShouldOwnClient=*/false);
 
   SmallVector<std::string> LibNames;
-  bool FoundUnknownLib = getDeviceLibraries(UserArgList, LibNames, Diags);
+  const bool FoundUnknownLib =
+      getDeviceLibraries(UserArgList, LibNames, Diags, Format);
   if (FoundUnknownLib) {
     return createStringError("Could not determine list of device libraries: %s",
                              BuildLog.c_str());
   }
+  const bool IsCudaHIP =
+      Format == BinaryFormat::PTX || Format == BinaryFormat::AMDGCN;
+  if (IsCudaHIP) {
+    // Based on the OS and the format decide on the version of libspirv.
+    // NOTE: this will be problematic if cross-compiling between OSes.
+    std::string Libclc{"clc/"};
+    Libclc.append(
+#ifdef _WIN32
+        "remangled-l32-signed_char.libspirv-"
+#else
+        "remangled-l64-signed_char.libspirv-"
+#endif
+    );
+    Libclc.append(Format == BinaryFormat::PTX ? "nvptx64-nvidia-cuda.bc"
+                                              : "amdgcn-amd-amdhsa.bc");
+    LibNames.push_back(Libclc);
+  }
 
   LLVMContext &Context = Module.getContext();
   for (const std::string &LibName : LibNames) {
@@ -577,6 +628,58 @@ Error jit_compiler::linkDeviceLibraries(llvm::Module &Module,
     }
   }
 
+  // For GPU targets we need to link against vendor provided libdevice.
+  if (IsCudaHIP) {
+    Triple T{Module.getTargetTriple()};
+    Driver D{(Twine(DPCPPRoot) + "/bin/clang++").str(), T.getTriple(), Diags};
+    auto [CPU, Features] =
+        Translator::getTargetCPUAndFeatureAttrs(&Module, "", Format);
+    (void)Features;
+    // Helper lambda to link modules.
+    auto LinkInLib = [&](const StringRef LibDevice) -> Error {
+      ModuleUPtr LibDeviceModule;
+      if (auto Error = loadBitcodeLibrary(LibDevice, Context)
+                           .moveInto(LibDeviceModule)) {
+        return Error;
+      }
+      if (Linker::linkModules(Module, std::move(LibDeviceModule),
+                              Linker::LinkOnlyNeeded)) {
+        return createStringError("Unable to link libdevice: %s",
+                                 BuildLog.c_str());
+      }
+      return Error::success();
+    };
+    SmallVector<std::string, 12> LibDeviceFiles;
+    if (Format == BinaryFormat::PTX) {
+      // For NVPTX we can get away with CudaInstallationDetector.
+      LazyDetector<CudaInstallationDetector> CudaInstallation{D, T,
+                                                              UserArgList};
+      auto LibDevice = CudaInstallation->getLibDeviceFile(CPU);
+      if (LibDevice.empty()) {
+        return createStringError("Unable to find Cuda libdevice");
+      }
+      LibDeviceFiles.push_back(LibDevice);
+    } else {
+      // AMDGPU requires entire toolchain in order to provide all common bitcode
+      // libraries.
+      clang::driver::toolchains::ROCMToolChain TC(D, T, UserArgList);
+      auto CommonDeviceLibs = TC.getCommonDeviceLibNames(
+          UserArgList, CPU, Action::OffloadKind::OFK_SYCL, false);
+      if (CommonDeviceLibs.empty()) {
+        return createStringError("Unable to find ROCm common device libraries");
+      }
+      for (auto &Lib : CommonDeviceLibs) {
+        LibDeviceFiles.push_back(Lib.Path);
+      }
+    }
+    for (auto &LibDeviceFile : LibDeviceFiles) {
+      // llvm::Error converts to false on success.
+      if (auto Error = LinkInLib(LibDeviceFile)) {
+        return Error;
+      }
+    }
+  }
+
   return Error::success();
 }
 
diff --git a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
index 62da2cdb54bf1..aa1b19df0cc8f 100644
--- a/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
+++ b/sycl-jit/jit-compiler/lib/rtc/DeviceCompilation.h
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "JITBinaryInfo.h"
 #include "RTC.h"
 
 #include <llvm/ADT/SmallVector.h>
@@ -24,16 +25,17 @@ using ModuleUPtr = std::unique_ptr<llvm::Module>;
 
 llvm::Expected<std::string>
 calculateHash(InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
-              const llvm::opt::InputArgList &UserArgList);
+              const llvm::opt::InputArgList &UserArgList, BinaryFormat Format);
 
 llvm::Expected<ModuleUPtr>
 compileDeviceCode(InMemoryFile SourceFile, View<InMemoryFile> IncludeFiles,
                   const llvm::opt::InputArgList &UserArgList,
-                  std::string &BuildLog, llvm::LLVMContext &Context);
+                  std::string &BuildLog, llvm::LLVMContext &Context,
+                  BinaryFormat Format);
 
 llvm::Error linkDeviceLibraries(llvm::Module &Module,
                                 const llvm::opt::InputArgList &UserArgList,
-                                std::string &BuildLog);
+                                std::string &BuildLog, BinaryFormat Format);
 
 using PostLinkResult = std::pair<RTCBundleInfo, llvm::SmallVector<ModuleUPtr>>;
 llvm::Expected<PostLinkResult>
diff --git a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
index e9307ccb27eed..2e42bca3a49a5 100644
--- a/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
+++ b/sycl-jit/jit-compiler/lib/rtc/RTC.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RTC.h"
+#include "JITBinaryInfo.h"
 #include "helper/ErrorHelper.h"
 #include "rtc/DeviceCompilation.h"
 #include "translation/SPIRVLLVMTranslation.h"
@@ -26,7 +27,8 @@ using namespace jit_compiler;
 
 JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
                                               View<InMemoryFile> IncludeFiles,
-                                              View<const char *> UserArgs) {
+                                              View<const char *> UserArgs,
+                                              BinaryFormat Format) {
   llvm::opt::InputArgList UserArgList;
   if (auto Error = parseUserArgs(UserArgs).moveInto(UserArgList)) {
     return errorTo<RTCHashResult>(std::move(Error),
@@ -36,8 +38,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 
   auto Start = std::chrono::high_resolution_clock::now();
   std::string Hash;
-  if (auto Error =
-          calculateHash(SourceFile, IncludeFiles, UserArgList).moveInto(Hash)) {
+  if (auto Error = calculateHash(SourceFile, IncludeFiles, UserArgList, Format)
+                       .moveInto(Hash)) {
     return errorTo<RTCHashResult>(std::move(Error), "Hashing failed",
                                   /*IsHash=*/false);
   }
@@ -55,7 +57,8 @@ JIT_EXPORT_SYMBOL RTCHashResult calculateHash(InMemoryFile SourceFile,
 JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
                                         View<InMemoryFile> IncludeFiles,
                                         View<const char *> UserArgs,
-                                        View<char> CachedIR, bool SaveIR) {
+                                        View<char> CachedIR, bool SaveIR,
+                                        BinaryFormat Format) {
   llvm::LLVMContext Context;
   std::string BuildLog;
   configureDiagnostics(Context, BuildLog);
@@ -104,7 +107,7 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
   bool FromSource = !Module;
   if (FromSource) {
     if (auto Error = compileDeviceCode(SourceFile, IncludeFiles, UserArgList,
-                                       BuildLog, Context)
+                                       BuildLog, Context, Format)
                          .moveInto(Module)) {
       return errorTo<RTCResult>(std::move(Error), "Device compilation failed");
     }
@@ -118,7 +121,8 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
     IR = RTCDeviceCodeIR{BCString.data(), BCString.data() + BCString.size()};
   }
 
-  if (auto Error = linkDeviceLibraries(*Module, UserArgList, BuildLog)) {
+  if (auto Error =
+          linkDeviceLibraries(*Module, UserArgList, BuildLog, Format)) {
     return errorTo<RTCResult>(std::move(Error), "Device linking failed");
   }
 
@@ -131,9 +135,9 @@ JIT_EXPORT_SYMBOL RTCResult compileSYCL(InMemoryFile SourceFile,
 
   for (auto [DevImgInfo, Module] :
        llvm::zip_equal(BundleInfo.DevImgInfos, Modules)) {
-    if (auto Error = Translator::translate(*Module, JITContext::getInstance(),
-                                           BinaryFormat::SPIRV)
-                         .moveInto(DevImgInfo.BinaryInfo)) {
+    if (auto Error =
+            Translator::translate(*Module, JITContext::getInstance(), Format)
+                .moveInto(DevImgInfo.BinaryInfo)) {
       return errorTo<RTCResult>(std::move(Error), "SPIR-V translation failed");
     }
   }
diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.cpp b/sycl-jit/jit-compiler/lib/translation/Translation.cpp
index 457b6bfc0e637..99eaffd3384d1 100644
--- a/sycl-jit/jit-compiler/lib/translation/Translation.cpp
+++ b/sycl-jit/jit-compiler/lib/translation/Translation.cpp
@@ -83,9 +83,6 @@ llvm::Expected<JITBinary *> Translator::translateToPTX(llvm::Module &Mod,
   LLVMInitializeNVPTXAsmPrinter();
   LLVMInitializeNVPTXTargetMC();
 
-  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
-  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
-
   std::string TargetTriple{"nvptx64-nvidia-cuda"};
 
   std::string ErrorMessage;
@@ -99,32 +96,11 @@ llvm::Expected<JITBinary *> Translator::translateToPTX(llvm::Module &Mod,
         ErrorMessage.c_str());
   }
 
-  // Give priority to user specified values (through environment variables:
-  // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES).
-  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
-  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
-  llvm::StringRef CPU = CPUVal.begin();
-  llvm::StringRef Features = FeaturesVal.begin();
-
-  auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr;
-  // If they were not set, use default and consult the module for alternatives
-  // (if present).
-  if (CPU.empty()) {
-    CPU = "sm_50";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
-    }
-  }
-  if (Features.empty()) {
-    Features = "+sm_50,+ptx76";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                     .getValueAsString();
-    }
-  }
+  auto [CPU, Features] =
+      getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::PTX);
 
   std::unique_ptr<TargetMachine> TargetMachine(Target->createTargetMachine(
-      Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
       llvm::CodeGenOptLevel::Default));
 
   llvm::legacy::PassManager PM;
@@ -166,9 +142,6 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
   LLVMInitializeAMDGPUAsmPrinter();
   LLVMInitializeAMDGPUTargetMC();
 
-  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
-  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
-
   std::string TargetTriple{"amdgcn-amd-amdhsa"};
 
   std::string ErrorMessage;
@@ -181,29 +154,10 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
         "Failed to load and translate AMDGCN LLVM IR module with error %s",
         ErrorMessage.c_str());
 
-  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
-  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
-  llvm::StringRef CPU = CPUVal.begin();
-  llvm::StringRef Features = FeaturesVal.begin();
-
-  auto *KernelFunc = KernelName ? Mod.getFunction(KernelName) : nullptr;
-  if (CPU.empty()) {
-    // Set to the lowest tested target according to the GetStartedGuide, section
-    // "Build DPC++ toolchain with support for HIP AMD"
-    CPU = "gfx906";
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
-      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
-    }
-  }
-  if (Features.empty()) {
-    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
-      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
-                     .getValueAsString();
-    }
-  }
-
+  auto [CPU, Features] =
+      getTargetCPUAndFeatureAttrs(&Mod, KernelName, BinaryFormat::AMDGCN);
   std::unique_ptr<TargetMachine> TargetMachine(Target->createTargetMachine(
-      Mod.getTargetTriple(), CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
+      Triple{TargetTriple}, CPU, Features, {}, llvm::Reloc::PIC_, std::nullopt,
       llvm::CodeGenOptLevel::Default));
 
   std::string AMDObj;
@@ -226,3 +180,34 @@ Translator::translateToAMDGCN(llvm::Module &Mod, JITContext &JITCtx,
   return &JITCtx.emplaceBinary(std::move(AMDObj), BinaryFormat::AMDGCN);
 #endif // JIT_SUPPORT_AMDGCN
 }
+
+std::pair<std::string, std::string> Translator::getTargetCPUAndFeatureAttrs(
+    const llvm::Module *M, const char *KernelName, BinaryFormat Format) {
+  assert((Format == BinaryFormat::AMDGCN || Format == BinaryFormat::PTX) &&
+         "Unexpected format found");
+  static const char *TARGET_CPU_ATTRIBUTE = "target-cpu";
+  static const char *TARGET_FEATURE_ATTRIBUTE = "target-features";
+  // Give priority to user specified values (through environment variables:
+  // SYCL_JIT_AMDGCN_PTX_TARGET_CPU and SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES).
+  auto *KernelFunc = (M && KernelName) ? M->getFunction(KernelName) : nullptr;
+  auto CPUVal = ConfigHelper::get<option::JITTargetCPU>();
+  auto FeaturesVal = ConfigHelper::get<option::JITTargetFeatures>();
+  llvm::StringRef CPU = CPUVal.begin();
+  llvm::StringRef Features = FeaturesVal.begin();
+  if (CPU.empty()) {
+    // Set to the lowest tested target according to the GetStartedGuide, section
+    // "Build DPC++ toolchain with support for HIP AMD"
+    CPU = Format == BinaryFormat::AMDGCN ? "gfx90a" : "sm_50";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_CPU_ATTRIBUTE)) {
+      CPU = KernelFunc->getFnAttribute(TARGET_CPU_ATTRIBUTE).getValueAsString();
+    }
+  }
+  if (Features.empty()) {
+    Features = Format == BinaryFormat::PTX ? "+sm_50,+ptx76" : "";
+    if (KernelFunc && KernelFunc->hasFnAttribute(TARGET_FEATURE_ATTRIBUTE)) {
+      Features = KernelFunc->getFnAttribute(TARGET_FEATURE_ATTRIBUTE)
+                     .getValueAsString();
+    }
+  }
+  return std::make_pair(std::string{CPU}, std::string{Features});
+}
diff --git a/sycl-jit/jit-compiler/lib/translation/Translation.h b/sycl-jit/jit-compiler/lib/translation/Translation.h
index 9c8dfdda5f9f5..78bf4ef8ac416 100644
--- a/sycl-jit/jit-compiler/lib/translation/Translation.h
+++ b/sycl-jit/jit-compiler/lib/translation/Translation.h
@@ -12,7 +12,6 @@
 #include "JITContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Error.h"
-#include <vector>
 
 namespace jit_compiler {
 
@@ -26,6 +25,9 @@ class Translator {
   translate(llvm::Module &Mod, JITContext &JITCtx, BinaryFormat Format,
             const char *KernelName = nullptr);
 
+  std::pair<std::string, std::string> static getTargetCPUAndFeatureAttrs(
+      const llvm::Module *M, const char *KernelName, BinaryFormat Format);
+
 private:
   /// Pair of address and size to represent a binary blob.
   using BinaryBlob = std::pair<BinaryAddress, size_t>;
diff --git a/sycl/source/detail/device_image_impl.hpp b/sycl/source/detail/device_image_impl.hpp
index bab3b9291d44f..c49409561f123 100644
--- a/sycl/source/detail/device_image_impl.hpp
+++ b/sycl/source/detail/device_image_impl.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include "JITBinaryInfo.h"
 #include <detail/adapter_impl.hpp>
 #include <detail/compiler.hpp>
 #include <detail/context_impl.hpp>
@@ -699,6 +700,22 @@ class device_image_impl
     return MRTCBinInfo && MRTCBinInfo->MLanguage == Lang;
   }
 
+  static ::jit_compiler::BinaryFormat getTargetFormat(const backend Backend) {
+    switch (Backend) {
+    case backend::ext_oneapi_level_zero:
+    case backend::opencl:
+      return ::jit_compiler::BinaryFormat::SPIRV;
+    case backend::ext_oneapi_cuda:
+      return ::jit_compiler::BinaryFormat::PTX;
+    case backend::ext_oneapi_hip:
+      return ::jit_compiler::BinaryFormat::AMDGCN;
+    default:
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::invalid),
+          "Backend does not support kernel_compiler extension");
+    }
+  }
+
   std::vector<std::shared_ptr<device_image_impl>> buildFromSource(
       const std::vector<device> &Devices,
       const std::vector<sycl::detail::string_view> &BuildOptions,
@@ -729,9 +746,12 @@ class device_image_impl
       }
     }
 
-    if (MRTCBinInfo->MLanguage == syclex::source_language::sycl)
+    if (MRTCBinInfo->MLanguage == syclex::source_language::sycl) {
+      const auto Format = getTargetFormat(MContext.get_backend());
       return createSYCLImages(Devices, bundle_state::executable, BuildOptions,
-                              LogPtr, RegisteredKernelNames, OutDeviceBins);
+                              LogPtr, RegisteredKernelNames, OutDeviceBins,
+                              Format);
+    }
 
     std::vector<ur_device_handle_t> DeviceVec;
     DeviceVec.reserve(Devices.size());
@@ -819,8 +839,10 @@ class device_image_impl
                               "device does not support source language");
       }
     }
+    const auto Format = getTargetFormat(MContext.get_backend());
     return createSYCLImages(Devices, bundle_state::object, CompileOptions,
-                            LogPtr, RegisteredKernelNames, OutDeviceBins);
+                            LogPtr, RegisteredKernelNames, OutDeviceBins,
+                            Format);
   }
 
 private:
@@ -992,8 +1014,8 @@ class device_image_impl
       const std::vector<sycl::detail::string_view> &Options,
       std::string *LogPtr,
       const std::vector<sycl::detail::string_view> &RegisteredKernelNames,
-      std::vector<std::shared_ptr<ManagedDeviceBinaries>> &OutDeviceBins)
-      const {
+      std::vector<std::shared_ptr<ManagedDeviceBinaries>> &OutDeviceBins,
+      ::jit_compiler::BinaryFormat Format) const {
     assert(MRTCBinInfo);
     assert(MRTCBinInfo->MLanguage == syclex::source_language::sycl);
     assert(std::holds_alternative<std::string>(MBinImage));
@@ -1023,7 +1045,7 @@ class device_image_impl
 
     auto [Binaries, Prefix] = syclex::detail::SYCL_JIT_Compile(
         RegisteredKernelNames.empty() ? SourceStr : SourceExt.str(),
-        MRTCBinInfo->MIncludePairs, Options, LogPtr);
+        MRTCBinInfo->MIncludePairs, Options, LogPtr, Format);
 
     auto &PM = detail::ProgramManager::getInstance();
 
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
index 189bd3d145309..2884e25399e5f 100644
--- a/sycl/source/detail/jit_compiler.cpp
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -263,6 +263,7 @@ sycl_device_binaries jit_compiler::createDeviceBinaries(
       Binary.addOffloadEntry(std::move(Entry));
     }
 
+    bool FinalizationTagAdded = false;
     for (const auto &FPS : DevImgInfo.Properties) {
       bool IsDeviceGlobalsPropSet =
           FPS.Name == __SYCL_PROPERTY_SET_SYCL_DEVICE_GLOBALS;
@@ -279,18 +280,50 @@ sycl_device_binaries jit_compiler::createDeviceBinaries(
               sycl_property_type::SYCL_PROPERTY_TYPE_BYTE_ARRAY});
         }
       }
+      if (FPS.Name == __SYCL_PROPERTY_SET_PROGRAM_METADATA &&
+          DevImgInfo.BinaryInfo.Format ==
+              ::jit_compiler::BinaryFormat::AMDGCN) {
+        PropSet.addProperty(PropertyContainer{
+            __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1});
+        FinalizationTagAdded = true;
+      }
       Binary.addProperty(std::move(PropSet));
 
       Binary.setCompileOptions(BundleInfo.CompileOptions.c_str());
     }
 
-    Collection->addDeviceBinary(std::move(Binary),
-                                DevImgInfo.BinaryInfo.BinaryStart,
-                                DevImgInfo.BinaryInfo.BinarySize,
-                                (DevImgInfo.BinaryInfo.AddressBits == 64)
-                                    ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64
-                                    : __SYCL_DEVICE_BINARY_TARGET_SPIRV32,
-                                SYCL_DEVICE_BINARY_TYPE_SPIRV);
+    auto BinaryTarget = "";
+    auto Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+    switch (DevImgInfo.BinaryInfo.Format) {
+    case ::jit_compiler::BinaryFormat::SPIRV:
+      BinaryTarget = DevImgInfo.BinaryInfo.AddressBits == 64
+                         ? __SYCL_DEVICE_BINARY_TARGET_SPIRV64
+                         : __SYCL_DEVICE_BINARY_TARGET_SPIRV32;
+      Format = SYCL_DEVICE_BINARY_TYPE_SPIRV;
+      break;
+    case ::jit_compiler::BinaryFormat::PTX:
+      BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_NVPTX64;
+      Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+      break;
+    case ::jit_compiler::BinaryFormat::AMDGCN: {
+      BinaryTarget = __SYCL_DEVICE_BINARY_TARGET_LLVM_AMDGCN;
+      Format = SYCL_DEVICE_BINARY_TYPE_NONE;
+      // If the program had no properties, the tag needs to be added now.
+      if (!FinalizationTagAdded) {
+        PropertySetContainer ProgramMetadata{
+            __SYCL_PROPERTY_SET_PROGRAM_METADATA};
+        ProgramMetadata.addProperty(PropertyContainer{
+            __SYCL_PROGRAM_METADATA_TAG_NEED_FINALIZATION, 1});
+        Binary.addProperty(std::move(ProgramMetadata));
+      }
+      break;
+    }
+    default:
+      assert(false && "Unsupported format");
+    };
+    Collection->addDeviceBinary(
+        std::move(Binary), DevImgInfo.BinaryInfo.BinaryStart,
+        DevImgInfo.BinaryInfo.BinarySize, BinaryTarget, Format);
   }
 
   sycl_device_binaries Binaries = Collection->getPIDeviceStruct();
@@ -311,7 +344,26 @@ void jit_compiler::destroyDeviceBinaries(sycl_device_binaries Binaries) {
 std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
     const std::string &CompilationID, const std::string &SYCLSource,
     const std::vector<std::pair<std::string, std::string>> &IncludePairs,
-    const std::vector<std::string> &UserArgs, std::string *LogPtr) {
+    const std::vector<std::string> &UserArgs, std::string *LogPtr,
+    ::jit_compiler::BinaryFormat Format) {
+  if (Format == ::jit_compiler::BinaryFormat::PTX ||
+      Format == ::jit_compiler::BinaryFormat::AMDGCN) {
+    // If present, set-up the config with env variables describing CPU and
+    // features.
+    auto SetUpOption = [](const std::string &Value) {
+      ::jit_compiler::JITEnvVar Option(Value.begin(), Value.end());
+      return Option;
+    };
+    ::jit_compiler::JITEnvVar TargetCPUOpt = SetUpOption(
+        detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_CPU>::get());
+    this->AddToConfigHandle(
+        ::jit_compiler::option::JITTargetCPU::set(TargetCPUOpt));
+    ::jit_compiler::JITEnvVar TargetFeaturesOpt = SetUpOption(
+        detail::SYCLConfig<detail::SYCL_JIT_AMDGCN_PTX_TARGET_FEATURES>::get());
+    this->AddToConfigHandle(
+        ::jit_compiler::option::JITTargetFeatures::set(TargetFeaturesOpt));
+  }
+
   auto appendToLog = [LogPtr](const char *Msg) {
     if (LogPtr) {
       LogPtr->append(Msg);
@@ -339,7 +391,7 @@ std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
   std::vector<char> CachedIR;
   if (PersistentDeviceCodeCache::isEnabled()) {
     auto Result =
-        CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView);
+        CalculateHashHandle(SourceFile, IncludeFilesView, UserArgsView, Format);
 
     if (Result.failed()) {
       appendToLog(Result.getPreprocLog());
@@ -349,8 +401,9 @@ std::pair<sycl_device_binaries, std::string> jit_compiler::compileSYCL(
     }
   }
 
-  auto Result = CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView,
-                                  CachedIR, /*SaveIR=*/!CacheKey.empty());
+  auto Result =
+      CompileSYCLHandle(SourceFile, IncludeFilesView, UserArgsView, CachedIR,
+                        /*SaveIR=*/!CacheKey.empty(), Format);
 
   const char *BuildLog = Result.getBuildLog();
   appendToLog(BuildLog);
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
index 0f0a3f3f5738c..e0aefd1deae5d 100644
--- a/sycl/source/detail/jit_compiler.hpp
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include <JITBinaryInfo.h>
 #include <detail/jit_device_binaries.hpp>
 #include <detail/queue_impl.hpp>
 #include <sycl/detail/kernel_name_str_t.hpp>
@@ -45,7 +46,8 @@ class jit_compiler {
   std::pair<sycl_device_binaries, std::string> compileSYCL(
       const std::string &CompilationID, const std::string &SYCLSource,
       const std::vector<std::pair<std::string, std::string>> &IncludePairs,
-      const std::vector<std::string> &UserArgs, std::string *LogPtr);
+      const std::vector<std::string> &UserArgs, std::string *LogPtr,
+      ::jit_compiler::BinaryFormat Format);
 
   void destroyDeviceBinaries(sycl_device_binaries Binaries);
 
diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
index 93ec94a8ac328..418842fdc48ec 100644
--- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
+++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp
@@ -44,7 +44,8 @@ std::pair<sycl_device_binaries, std::string> SYCL_JIT_Compile(
     [[maybe_unused]] const std::string &SYCLSource,
     [[maybe_unused]] const include_pairs_t &IncludePairs,
     [[maybe_unused]] const std::vector<sycl::detail::string_view> &UserArgs,
-    [[maybe_unused]] std::string *LogPtr) {
+    [[maybe_unused]] std::string *LogPtr,
+    [[maybe_unused]] ::jit_compiler::BinaryFormat Format) {
 #if SYCL_EXT_JIT_ENABLE
   static std::atomic_uintptr_t CompilationCounter;
   std::string CompilationID = "rtc_" + std::to_string(CompilationCounter++);
@@ -52,7 +53,7 @@ std::pair<sycl_device_binaries, std::string> SYCL_JIT_Compile(
   for (const sycl::detail::string_view UserArg : UserArgs)
     UserArgStrings.push_back(UserArg.data());
   return sycl::detail::jit_compiler::get_instance().compileSYCL(
-      CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr);
+      CompilationID, SYCLSource, IncludePairs, UserArgStrings, LogPtr, Format);
 #else
   throw sycl::exception(sycl::errc::build,
                         "kernel_compiler via sycl-jit is not available");
diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
index 047a10a061e1f..007dfae9aff22 100644
--- a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
+++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp
@@ -8,6 +8,7 @@
 
 #pragma once
 
+#include <JITBinaryInfo.h>
 #include <sycl/detail/defines_elementary.hpp>
 #include <sycl/detail/export.hpp> // __SYCL_EXPORT
 #include <sycl/detail/string_view.hpp>
@@ -35,7 +36,7 @@ userArgsAsString(const std::vector<sycl::detail::string_view> &UserArguments);
 std::pair<sycl_device_binaries, std::string>
 SYCL_JIT_Compile(const std::string &Source, const include_pairs_t &IncludePairs,
                  const std::vector<sycl::detail::string_view> &UserArgs,
-                 std::string *LogPtr);
+                 std::string *LogPtr, ::jit_compiler::BinaryFormat Format);
 
 void SYCL_JIT_Destroy(sycl_device_binaries Binaries);
 
diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp
index a7e0f9289aac0..8fbeb5e98bc57 100644
--- a/sycl/source/kernel_bundle.cpp
+++ b/sycl/source/kernel_bundle.cpp
@@ -409,8 +409,10 @@ bool is_source_kernel_bundle_supported(
     const std::vector<device_impl *> &DeviceImplVec) {
   backend BE = DeviceImplVec[0]->getBackend();
   // Support is limited to the opencl and level_zero backends.
-  bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) ||
-                       (BE == sycl::backend::opencl);
+  bool BE_Acceptable =
+      (BE == sycl::backend::ext_oneapi_level_zero) ||
+      (BE == sycl::backend::opencl || BE == sycl::backend::ext_oneapi_hip ||
+       BE == sycl::backend::ext_oneapi_cuda);
   if (!BE_Acceptable)
     return false;
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl.cpp b/sycl/test-e2e/KernelCompiler/sycl.cpp
index fac041254778a..10d4fe69b839d 100644
--- a/sycl/test-e2e/KernelCompiler/sycl.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 // XFAIL: preview-mode && run-mode
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
@@ -111,15 +110,14 @@ auto constexpr DeviceLibrariesSource = R"===(
 #include <sycl/sycl.hpp>
 #include <cmath>
 #include <complex>
-#include <sycl/ext/intel/math.hpp>
 
 extern "C" SYCL_EXTERNAL 
 SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel)
 void device_libs_kernel(float *ptr) {
   // Extension list: llvm/lib/SYCLLowerIR/SYCLDeviceLibReqMask.cpp
 
-  // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing it.
-  // Only test the fp32 variants of complex, math and imf to keep this test
+  // cl_intel_devicelib_assert is not available for opencl:gpu; skip testing
+  // it. Only test the fp32 variants of complex and math to keep this test
   // device-agnostic.
   
   // cl_intel_devicelib_math
@@ -131,14 +129,8 @@ void device_libs_kernel(float *ptr) {
   // cl_intel_devicelib_cstring
   ptr[2] = memcmp(ptr + 2, ptr + 2, sizeof(float));
 
-  // cl_intel_devicelib_imf
-  ptr[3] = sycl::ext::intel::math::sqrt(ptr[3] * 2);
-
-  // cl_intel_devicelib_imf_bf16
-  ptr[4] = sycl::ext::intel::math::float2bfloat16(ptr[4] * 0.5f);
-
   // cl_intel_devicelib_bfloat16
-  ptr[5] = sycl::ext::oneapi::bfloat16{ptr[5] / 0.25f};
+  ptr[3] = sycl::ext::oneapi::bfloat16{ptr[3] / 0.25f};
 }
 )===";
 
@@ -359,7 +351,7 @@ int test_device_libraries(sycl::queue q) {
   exe_kb kbExe = syclex::build(kbSrc);
 
   sycl::kernel k = kbExe.ext_oneapi_get_kernel("device_libs_kernel");
-  constexpr size_t nElem = 6;
+  constexpr size_t nElem = 4;
   float *ptr = sycl::malloc_shared<float>(nElem, q);
   for (int i = 0; i < nElem; ++i)
     ptr[i] = 1.0f;
diff --git a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
index 5c212ee0d843d..033070acca675 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_basic.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
index 1ec2f744b632d..15d3fe829ba70 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_cache.cpp
@@ -7,6 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 // REQUIRES: (opencl || level_zero)
+// Unlike other RTC tests, don't run this one on Cuda/HIP. Eviction mechanism
+// is based on the size of compiled kernels, which in turns depends on the
+// target. Don't run eviction check for CUDA/HIP, so that we don't have to find
+// a magic number that works for all binaries (and by definition is flaky).
+
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
diff --git a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
index d111e77960e48..6f27ee95d84a8 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_context_error.cpp
@@ -6,13 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
-
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{run-unfiltered-devices} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run-unfiltered-devices} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
index ceda7252369e5..3f10968c45f57 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_device_globals.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator, opencl && gpu
@@ -15,7 +14,7 @@
 // UNSUPPORTED-TRACKER: GSD-4287
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
index 789bab6223546..e8390ced06efb 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_export_registration.cpp
@@ -11,6 +11,8 @@
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for a case where a kernel bundle is built that exports a symbol and
 // -- other kernel bundles that uses it are compiled/linked without it. These
 // -- cases should fail due to unresolved symbols, rather than picking up the
diff --git a/sycl/test-e2e/KernelCompiler/sycl_imf.cpp b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp
new file mode 100644
index 0000000000000..142cc2160ac52
--- /dev/null
+++ b/sycl/test-e2e/KernelCompiler/sycl_imf.cpp
@@ -0,0 +1,79 @@
+//==--- sycl_imf.cpp --- kernel_compiler extension imf tests ---------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: aspect-usm_device_allocations
+// REQUIRES: (opencl || level_zero)
+
+// UNSUPPORTED: accelerator
+// UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
+
+// RUN: %{build} -o %t.out
+// RUN: %{l0_leak_check} %{run} %t.out
+
+// XFAIL: preview-mode && run-mode
+// XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
+
+#include <sycl/detail/core.hpp>
+#include <sycl/kernel_bundle.hpp>
+#include <sycl/usm.hpp>
+
+auto constexpr IMFSources = R"===(
+#include <sycl/sycl.hpp>
+#include <cmath>
+#include <sycl/ext/intel/math.hpp>
+
+extern "C" SYCL_EXTERNAL 
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::single_task_kernel)
+void imf_kernel(float *ptr) {
+  // cl_intel_devicelib_imf
+  ptr[0] = sycl::ext::intel::math::sqrt(ptr[0] * 2);
+
+  // cl_intel_devicelib_imf_bf16
+  ptr[1] = sycl::ext::intel::math::float2bfloat16(ptr[1] * 0.5f);
+}
+)===";
+
+int main() {
+#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER
+  sycl::queue q;
+  namespace syclex = sycl::ext::oneapi::experimental;
+  using source_kb = sycl::kernel_bundle<sycl::bundle_state::ext_oneapi_source>;
+  using exe_kb = sycl::kernel_bundle<sycl::bundle_state::executable>;
+
+  sycl::context ctx = q.get_context();
+
+  source_kb kbSrc = syclex::create_kernel_bundle_from_source(
+      ctx, syclex::source_language::sycl, IMFSources);
+  exe_kb kbExe = syclex::build(kbSrc);
+
+  sycl::kernel k = kbExe.ext_oneapi_get_kernel("imf_kernel");
+  constexpr size_t nElem = 2;
+  float *ptr = sycl::malloc_shared<float>(nElem, q);
+  for (int i = 0; i < nElem; ++i)
+    ptr[i] = 1.0f;
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.set_arg(0, ptr);
+    cgh.single_task(k);
+  });
+  q.wait_and_throw();
+
+  // Check that the kernel was executed. Given the {1.0, 1.0} input,
+  // the expected result is approximately {1.41, 0.5}.
+  for (unsigned i = 0; i < nElem; ++i) {
+    std::cout << ptr[i] << ' ';
+    assert(ptr[i] != 1.0f);
+  }
+  std::cout << std::endl;
+
+  sycl::free(ptr, q);
+#else
+  static_assert(false, "Kernel Compiler feature test macro undefined");
+#endif
+  return 0;
+}
diff --git a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
index bf0d46b5800be..b65e25ccb445b 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_include_paths.cpp
@@ -6,18 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
-
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-NOCWD
 
 // COM: Run test again in a directory that contains a different version of
 //      `header1.hpp`
-// RUN: cd %S/include/C ; %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD
+// RUN: cd %S/include/C ; %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out %S | FileCheck %s --check-prefixes=CHECK,CHECK-CWD
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_join.cpp b/sycl/test-e2e/KernelCompiler/sycl_join.cpp
index 4ec0ec5c35604..8d013f9713cc7 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_join.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_join.cpp
@@ -6,15 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_shared_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 // XFAIL: preview-mode && run-mode
 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/18390
diff --git a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
index 4ef1cb2d88e9c..82b866353e615 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_lifetimes.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} env SYCL_UR_TRACE=-1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
@@ -51,14 +50,15 @@ int test_lifetimes() {
       ctx, syclex::source_language::sycl, SYCLSource);
 
   exe_kb kbExe1 = syclex::build(kbSrc);
-  // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG1:.*]]))
+  // Cuda/Hip programs will be created with Binary, spirv IL.
+  // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG1:.*]]))
 
   {
     std::cout << "Scope1\n";
     // CHECK: Scope1
     exe_kb kbExe2 = syclex::build(kbSrc);
     // kbExe2 goes out of scope; its kernels are removed from program mananager.
-    // CHECK: urProgramCreateWithIL{{.*}}phProgram{{.*}}([[PROG2:.*]]))
+    // CHECK: urProgramCreateWith{{IL|Binary}}{{.*}}phProgram{{.*}}([[PROG2:.*]]))
     // CHECK: urProgramRelease{{.*}}[[PROG2]]
   }
   std::cout << "End Scope1\n";
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link.cpp b/sycl/test-e2e/KernelCompiler/sycl_link.cpp
index 9ff9878e387ee..17aedb9685360 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link.cpp
@@ -15,6 +15,8 @@
 // -- Test for a simple linking case with source files compiled from SYCL source
 // -- at runtime.
 
+// Note linking is not supported on CUDA/HIP.
+
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 // RUN: %{l0_leak_check} %{run} %t.out
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
index 49d67205b8cd1..ebe9e1d8ab56e 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep.cpp
@@ -12,6 +12,8 @@
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for linking where two kernels use the same imported symbols.
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
index 76fe85ce72fa9..dbc31e63da125 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_common_dep_optional_feature.cpp
@@ -12,6 +12,8 @@
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for linking where two kernels use the same imported symbols, but one
 // -- may not be supported on the device.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
index f9d6381e08b38..465ce7b5136a4 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_export_conflict.cpp
@@ -12,6 +12,8 @@
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for a case where a kernel bundle with an exported symbol is compiled
 // -- before another kernel bundle using a different variant of the symbol.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
index 7fe9b0fd9db79..5b2a0e41b33b3 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_link_kernel_conflict.cpp
@@ -12,6 +12,8 @@
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
+// Note linking is not supported on CUDA/HIP.
+
 // -- Test for the linking of two kernels with conflicting definitions of
 // -- kernels with the same name.
 
diff --git a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
index fbe2a75daf617..713c220419c75 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_namespaces.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
index 97488d880993b..00ac6c76fa10c 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_overload.cpp
@@ -6,14 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // REQUIRES: aspect-usm_device_allocations
 
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: while accelerator is AoT only, this cannot run there.
 
 // RUN: %{build} -o %t.out
-// RUN: %{l0_leak_check} %{run} %t.out
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{l0_leak_check} %{run} %t.out
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
index 77a2d89bed123..3d149adff8ebe 100644
--- a/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
+++ b/sycl/test-e2e/KernelCompiler/sycl_time_trace.cpp
@@ -6,12 +6,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: (opencl || level_zero)
 // UNSUPPORTED: accelerator
 // UNSUPPORTED-INTENDED: SYCL-RTC is not available for accelerator devices
 
 // RUN: %{build} -o %t.out
-// RUN: %{run} %t.out | FileCheck %s
+// RUN: %if hip %{ env SYCL_JIT_AMDGCN_PTX_TARGET_CPU=%{amd_arch} %} %{run} %t.out | FileCheck %s
 
 #include <sycl/detail/core.hpp>
 #include <sycl/kernel_bundle.hpp>
diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py
index 44da0ef1c52c4..ff2f8e36ecfb7 100644
--- a/sycl/test-e2e/format.py
+++ b/sycl/test-e2e/format.py
@@ -244,6 +244,7 @@ def execute(self, test, litConfig):
             )
             sycl_target_opts += hip_arch_opts
             substitutions.append(("%{hip_arch_opts}", hip_arch_opts))
+            substitutions.append(("%{amd_arch}", test.config.amd_arch))
         if (
             "target-spir" in build_targets
             and "spirv-backend" in test.config.available_features
diff --git a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
index 5b97ae7a0cac9..4292342068187 100644
--- a/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
+++ b/sycl/test/e2e_test_requirements/no_sycl_hpp_in_e2e_tests.cpp
@@ -6,7 +6,7 @@
 // CHECK-DAG: README.md
 // CHECK-DAG: lit.cfg.py
 //
-// CHECK-NUM-MATCHES: 26
+// CHECK-NUM-MATCHES: 28
 //
 // This test verifies that `<sycl/sycl.hpp>` isn't used in E2E tests. Instead,
 // fine-grained includes should used, see