diff --git a/buildbot/configure.py b/buildbot/configure.py
index f3a43857b7e1..5260a8c56336 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -23,6 +23,11 @@ def do_configure(args):
     libclc_amd_target_names = ';amdgcn--;amdgcn--amdhsa'
     libclc_nvidia_target_names = ';nvptx64--;nvptx64--nvidiacl'
 
+    sycl_enable_fusion = "OFF"
+    if not args.disable_fusion:
+        llvm_external_projects += ";sycl-fusion"
+        sycl_enable_fusion = "ON"
+
     if args.llvm_external_projects:
         llvm_external_projects += ";" + args.llvm_external_projects.replace(",", ";")
 
@@ -32,6 +37,7 @@ def do_configure(args):
     xpti_dir = os.path.join(abs_src_dir, "xpti")
     xptifw_dir = os.path.join(abs_src_dir, "xptifw")
     libdevice_dir = os.path.join(abs_src_dir, "libdevice")
+    fusion_dir = os.path.join(abs_src_dir, "sycl-fusion")
     llvm_targets_to_build = args.host_target
     llvm_enable_projects = 'clang;' + llvm_external_projects
     libclc_targets_to_build = ''
@@ -144,6 +150,7 @@ def do_configure(args):
         "-DXPTI_SOURCE_DIR={}".format(xpti_dir),
         "-DLLVM_EXTERNAL_XPTIFW_SOURCE_DIR={}".format(xptifw_dir),
         "-DLLVM_EXTERNAL_LIBDEVICE_SOURCE_DIR={}".format(libdevice_dir),
+        "-DLLVM_EXTERNAL_SYCL_FUSION_SOURCE_DIR={}".format(fusion_dir),
         "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects),
         "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build),
         "-DLIBCLC_GENERATE_REMANGLED_VARIANTS={}".format(libclc_gen_remangled_variants),
@@ -159,7 +166,8 @@ def do_configure(args):
         "-DLLVM_ENABLE_LLD={}".format(llvm_enable_lld),
         "-DXPTI_ENABLE_WERROR={}".format(xpti_enable_werror),
         "-DSYCL_CLANG_EXTRA_FLAGS={}".format(sycl_clang_extra_flags),
-        "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins)))
+        "-DSYCL_ENABLE_PLUGINS={}".format(';'.join(set(sycl_enabled_plugins))),
+        "-DSYCL_ENABLE_KERNEL_FUSION={}".format(sycl_enable_fusion)
     ]
 
     if args.l0_headers and args.l0_loader:
@@ -238,6 +246,7 @@ def main():
     parser.add_argument("--llvm-external-projects", help="Add external projects to build. Add as comma seperated list.")
     parser.add_argument("--ci-defaults", action="store_true", help="Enable default CI parameters")
     parser.add_argument("--enable-plugin", action='append', help="Enable SYCL plugin")
+    parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler")
     args = parser.parse_args()
 
     print("args:{}".format(args))
diff --git a/sycl-fusion/CMakeLists.txt b/sycl-fusion/CMakeLists.txt
index 8be941097aba..cf1766d5bcda 100644
--- a/sycl-fusion/CMakeLists.txt
+++ b/sycl-fusion/CMakeLists.txt
@@ -9,6 +9,10 @@ set(SYCL_JIT_BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 # directories, similar to how clang/CMakeLists.txt does it.
 set(LLVM_SPIRV_INCLUDE_DIRS "${LLVM_MAIN_SRC_DIR}/../llvm-spirv/include")
 
-add_subdirectory(jit-compiler)
-add_subdirectory(passes)
-add_subdirectory(test)
+if(WIN32)
+  message(WARNING "Kernel fusion not yet supported on Windows")
+else(WIN32)
+  add_subdirectory(jit-compiler)
+  add_subdirectory(passes)
+  add_subdirectory(test)
+endif(WIN32)
diff --git a/sycl-fusion/jit-compiler/CMakeLists.txt b/sycl-fusion/jit-compiler/CMakeLists.txt
index 1b1951b01d5a..6f087e8801d2 100644
--- a/sycl-fusion/jit-compiler/CMakeLists.txt
+++ b/sycl-fusion/jit-compiler/CMakeLists.txt
@@ -39,6 +39,15 @@ target_link_libraries(sycl-fusion
   ${CMAKE_THREAD_LIBS_INIT}
 )
 
+if(NOT MSVC AND NOT APPLE)
+  # Manage symbol visibility through the linker to make sure no LLVM symbols
+  # are exported and confuse the drivers.
+  set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt")
+  target_link_libraries(
+    sycl-fusion PRIVATE "-Wl,--version-script=${linker_script}")
+  set_target_properties(sycl-fusion PROPERTIES LINK_DEPENDS ${linker_script})
+endif()
+
 install(TARGETS sycl-fusion
   LIBRARY DESTINATION "lib${LLVM_LIBDIR_SUFFIX}" COMPONENT sycl-fusion
   RUNTIME DESTINATION "bin" COMPONENT sycl-fusion)
diff --git a/sycl-fusion/jit-compiler/ld-version-script.txt b/sycl-fusion/jit-compiler/ld-version-script.txt
new file mode 100644
index 000000000000..1532381962d4
--- /dev/null
+++ b/sycl-fusion/jit-compiler/ld-version-script.txt
@@ -0,0 +1,8 @@
+{
+  global:
+    /* Export everything from jit_compiler namespace */
+    _ZN12jit_compiler*;
+
+  local:
+    *;
+};
diff --git a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
index 547a7ae64848..36bfbe1f5d79 100644
--- a/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
+++ b/sycl-fusion/jit-compiler/lib/fusion/FusionPipeline.cpp
@@ -78,7 +78,7 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
     FPM.addPass(createFunctionToLoopPassAdaptor(IndVarSimplifyPass{}));
     LoopUnrollOptions UnrollOptions;
     FPM.addPass(LoopUnrollPass{UnrollOptions});
-    FPM.addPass(SROAPass{});
+    FPM.addPass(SROAPass{SROAOptions::ModifyCFG});
     // Run the InferAddressSpace pass to remove as many address-space casts
     // to/from generic address-space as possible, because these hinder
     // internalization.
@@ -94,11 +94,11 @@ FusionPipeline::runFusionPasses(Module &Mod, SYCLModuleInfo &InputInfo,
   // Run additional optimization passes after completing fusion.
   {
     FunctionPassManager FPM;
-    FPM.addPass(SROAPass{});
+    FPM.addPass(SROAPass{SROAOptions::ModifyCFG});
     FPM.addPass(SCCPPass{});
     FPM.addPass(InstCombinePass{});
     FPM.addPass(SimplifyCFGPass{});
-    FPM.addPass(SROAPass{});
+    FPM.addPass(SROAPass{SROAOptions::ModifyCFG});
     FPM.addPass(InstCombinePass{});
     FPM.addPass(SimplifyCFGPass{});
     FPM.addPass(ADCEPass{});
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index fe9050e8f857..3e3923f46fbc 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -145,7 +145,12 @@ install(DIRECTORY ${OpenCL_INCLUDE_DIR}/CL
         COMPONENT OpenCL-Headers)
 
 # Option to enable online kernel fusion via a JIT compiler
-option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" OFF)
+option(SYCL_ENABLE_KERNEL_FUSION "Enable kernel fusion via JIT compiler" ON)
+if(SYCL_ENABLE_KERNEL_FUSION AND WIN32)
+  message(WARNING "Kernel fusion not yet supported on Windows")
+  set(SYCL_ENABLE_KERNEL_FUSION OFF CACHE 
+      BOOL "Kernel fusion not yet supported on Windows" FORCE)
+endif()
 
 # Needed for feature_test.hpp
 if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS)
diff --git a/sycl/cmake/modules/AddSYCLUnitTest.cmake b/sycl/cmake/modules/AddSYCLUnitTest.cmake
index 9f5e6dc30a48..ea8135be0f09 100644
--- a/sycl/cmake/modules/AddSYCLUnitTest.cmake
+++ b/sycl/cmake/modules/AddSYCLUnitTest.cmake
@@ -56,6 +56,11 @@ macro(add_sycl_unittest test_dirname link_variant)
       OpenCL-Headers
       ${SYCL_LINK_LIBS}
     )
+
+  if(SYCL_ENABLE_KERNEL_FUSION)
+    target_link_libraries(${test_dirname} PRIVATE sycl-fusion)
+  endif(SYCL_ENABLE_KERNEL_FUSION)
+  
   target_include_directories(${test_dirname}
     PRIVATE SYSTEM
       ${sycl_inc_dir}
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md
index e48e16045845..e0cb806810d1 100644
--- a/sycl/doc/GetStartedGuide.md
+++ b/sycl/doc/GetStartedGuide.md
@@ -12,6 +12,7 @@ and a wide range of compute accelerators such as GPU and FPGA.
     - [Build DPC++ toolchain with support for HIP AMD](#build-dpc-toolchain-with-support-for-hip-amd)
     - [Build DPC++ toolchain with support for HIP NVIDIA](#build-dpc-toolchain-with-support-for-hip-nvidia)
     - [Build DPC++ toolchain with support for ESIMD CPU Emulation](#build-dpc-toolchain-with-support-for-esimd-emulator)
+    - [Build DPC++ toolchain with support for runtime kernel fusion](#build-dpc-toolchain-with-support-for-runtime-kernel-fusion)
     - [Build Doxygen documentation](#build-doxygen-documentation)
     - [Deployment](#deployment)
   - [Use DPC++ toolchain](#use-dpc-toolchain)
@@ -298,6 +299,16 @@ Enabling this flag requires following packages installed.
 Currently, this feature was tested and verified on Ubuntu 20.04
 environment.
 
+### Build DPC++ toolchain with support for runtime kernel fusion
+
+Support for the experimental SYCL extension for user-driven kernel fusion 
+at runtime is enabled by default. 
+
+To disable support for this feature, follow the instructions for the 
+Linux DPC++ toolchain, but add the `--disable-fusion` flag.
+
+Kernel fusion is currently not yet supported on the Windows platform.
+
 ### Build Doxygen documentation
 
 Building Doxygen documentation is similar to building the product itself. First,
diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp
index 6751dcb54206..b835b8f24968 100644
--- a/sycl/include/sycl/detail/cg.hpp
+++ b/sycl/include/sycl/detail/cg.hpp
@@ -98,6 +98,10 @@ class CG {
 
   CGTYPE getType() { return MType; }
 
+  std::vector<std::vector<char>> &getArgsStorage() { return MArgsStorage; }
+
+  std::vector<detail::AccessorImplPtr> &getAccStorage() { return MAccStorage; }
+
   virtual ~CG() = default;
 
 private:
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
index ecd54a4ced59..9fabf9bc9270 100644
--- a/sycl/source/CMakeLists.txt
+++ b/sycl/source/CMakeLists.txt
@@ -87,6 +87,13 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME)
     PRIVATE OpenCL-Headers
   )
 
+  if(SYCL_ENABLE_KERNEL_FUSION)
+    target_link_libraries(${LIB_NAME} PRIVATE sycl-fusion)
+    target_link_libraries(${LIB_OBJ_NAME} PRIVATE sycl-fusion)
+    set_property(GLOBAL APPEND PROPERTY SYCL_TOOLCHAIN_INSTALL_COMPONENTS
+      sycl-fusion)
+  endif(SYCL_ENABLE_KERNEL_FUSION)
+
   find_package(Threads REQUIRED)
 
   target_link_libraries(${LIB_NAME}
@@ -139,6 +146,8 @@ set(SYCL_SOURCES
     "detail/handler_proxy.cpp"
     "detail/image_accessor_util.cpp"
     "detail/image_impl.cpp"
+    "detail/jit_compiler.cpp"
+    "detail/jit_device_binaries.cpp"
     "detail/kernel_impl.cpp"
     "detail/kernel_program_cache.cpp"
     "detail/memory_manager.cpp"
diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp
new file mode 100644
index 000000000000..e1ee8e1c9430
--- /dev/null
+++ b/sycl/source/detail/jit_compiler.cpp
@@ -0,0 +1,881 @@
+//==--- jit_compiler.cpp - SYCL runtime JIT compiler for kernel fusion -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <sycl/feature_test.hpp>
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+#include <KernelFusion.h>
+#include <detail/device_image_impl.hpp>
+#include <detail/jit_compiler.hpp>
+#include <detail/kernel_bundle_impl.hpp>
+#include <detail/kernel_impl.hpp>
+#include <detail/queue_impl.hpp>
+#include <detail/sycl_mem_obj_t.hpp>
+#include <sycl/detail/pi.hpp>
+#include <sycl/ext/codeplay/experimental/fusion_properties.hpp>
+#include <sycl/kernel_bundle.hpp>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+jit_compiler::jit_compiler() : MJITContext{new ::jit_compiler::JITContext{}} {}
+
+jit_compiler::~jit_compiler() = default;
+
+static ::jit_compiler::BinaryFormat
+translateBinaryImageFormat(pi::PiDeviceBinaryType Type) {
+  switch (Type) {
+  case PI_DEVICE_BINARY_TYPE_SPIRV:
+    return ::jit_compiler::BinaryFormat::SPIRV;
+  case PI_DEVICE_BINARY_TYPE_LLVMIR_BITCODE:
+    return ::jit_compiler::BinaryFormat::LLVM;
+  default:
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Format unsupported for JIT compiler");
+  }
+}
+
+static ::jit_compiler::ParameterKind
+translateArgType(kernel_param_kind_t Kind) {
+  using PK = ::jit_compiler::ParameterKind;
+  using kind = kernel_param_kind_t;
+  switch (Kind) {
+  case kind::kind_accessor:
+    return PK::Accessor;
+  case kind::kind_std_layout:
+    return PK::StdLayout;
+  case kind::kind_sampler:
+    return PK::Sampler;
+  case kind::kind_pointer:
+    return PK::Pointer;
+  case kind::kind_specialization_constants_buffer:
+    return PK::SpecConstBuffer;
+  case kind::kind_stream:
+    return PK::Stream;
+  case kind::kind_invalid:
+    return PK::Invalid;
+  }
+  return PK::Invalid;
+}
+
+enum class Promotion { None, Private, Local };
+
+struct PromotionInformation {
+  Promotion PromotionTarget;
+  unsigned KernelIndex;
+  unsigned ArgIndex;
+  Requirement *Definition;
+  NDRDescT NDRange;
+  size_t LocalSize;
+  std::vector<bool> UsedParams;
+};
+
+using PromotionMap = std::unordered_map<SYCLMemObjI *, PromotionInformation>;
+
+static inline void printPerformanceWarning(const std::string &Message) {
+  if (detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0) {
+    std::cerr << "WARNING: " << Message << "\n";
+  }
+}
+
+template <typename Obj> Promotion getPromotionTarget(const Obj &obj) {
+  auto Result = Promotion::None;
+  if (obj.template has_property<
+          ext::codeplay::experimental::property::promote_private>()) {
+    Result = Promotion::Private;
+  }
+  if (obj.template has_property<
+          ext::codeplay::experimental::property::promote_local>()) {
+    if (Result != Promotion::None) {
+      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                            "Two contradicting promotion properties on the "
+                            "same buffer/accessor are not allowed.");
+    }
+    Result = Promotion::Local;
+  }
+  return Result;
+}
+
+static Promotion getInternalizationInfo(Requirement *Req) {
+  auto AccPromotion = getPromotionTarget(Req->MPropertyList);
+
+  auto *MemObj = static_cast<sycl::detail::SYCLMemObjT *>(Req->MSYCLMemObj);
+  if (MemObj->getType() != SYCLMemObjI::MemObjType::Buffer) {
+    // We currently do not support promotion on non-buffer memory objects (e.g.,
+    // images).
+    return Promotion::None;
+  }
+  Promotion BuffPromotion = getPromotionTarget(*MemObj);
+  if (AccPromotion != Promotion::None && BuffPromotion != Promotion::None &&
+      AccPromotion != BuffPromotion) {
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Contradicting promotion properties on accessor and "
+                          "underlying buffer are not allowed");
+  }
+  return (AccPromotion != Promotion::None) ? AccPromotion : BuffPromotion;
+}
+
+static std::optional<size_t> getLocalSize(NDRDescT NDRange, Requirement *Req,
+                                          Promotion Target) {
+  auto NumElementsMem = static_cast<SYCLMemObjT *>(Req->MSYCLMemObj)->size();
+  if (Target == Promotion::Private) {
+    auto NumWorkItems = NDRange.GlobalSize.size();
+    // For private internalization, the local size is
+    // (Number of elements in buffer)/(number of work-items)
+    return NumElementsMem / NumWorkItems;
+  } else if (Target == Promotion::Local) {
+    if (NDRange.LocalSize.size() == 0) {
+      // No work-group size provided, cannot calculate the local size
+      // and need to bail out.
+      return {};
+    }
+    auto NumWorkGroups = NDRange.GlobalSize.size() / NDRange.LocalSize.size();
+    // For local internalization, the local size is
+    // (Number of elements in buffer)/(number of work-groups)
+    return NumElementsMem / NumWorkGroups;
+  }
+  return 0;
+}
+
+static bool accessorEquals(Requirement *Req, Requirement *Other) {
+  return Req->MOffset == Other->MOffset &&
+         Req->MAccessRange == Other->MAccessRange &&
+         Req->MMemoryRange == Other->MMemoryRange &&
+         Req->MSYCLMemObj == Other->MSYCLMemObj && Req->MDims == Other->MDims &&
+         Req->MElemSize == Other->MElemSize &&
+         Req->MOffsetInBytes == Other->MOffsetInBytes &&
+         Req->MIsSubBuffer == Other->MIsSubBuffer;
+}
+
+static void resolveInternalization(ArgDesc &Arg, unsigned KernelIndex,
+                                   unsigned ArgFunctionIndex, NDRDescT NDRange,
+                                   PromotionMap &Promotions) {
+  assert(Arg.MType == kernel_param_kind_t::kind_accessor);
+
+  Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
+
+  auto ThisPromotionTarget = getInternalizationInfo(Req);
+  auto ThisLocalSize = getLocalSize(NDRange, Req, ThisPromotionTarget);
+
+  if (Promotions.count(Req->MSYCLMemObj)) {
+    // We previously encountered an accessor for the same buffer.
+    auto &PreviousDefinition = Promotions.at(Req->MSYCLMemObj);
+
+    switch (ThisPromotionTarget) {
+    case Promotion::None: {
+      if (PreviousDefinition.PromotionTarget != Promotion::None) {
+        printPerformanceWarning(
+            "Deactivating previously specified promotion, because this "
+            "accessor does not specify promotion");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+      }
+      return;
+    }
+    case Promotion::Local: {
+      if (PreviousDefinition.PromotionTarget == Promotion::None) {
+        printPerformanceWarning(
+            "Not performing specified local promotion, due to previous "
+            "mismatch or because previous accessor specified no promotion");
+        return;
+      }
+      if (!ThisLocalSize.has_value()) {
+        printPerformanceWarning("Work-group size for local promotion not "
+                                "specified, not performing internalization");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      if (PreviousDefinition.PromotionTarget == Promotion::Private) {
+        printPerformanceWarning(
+            "Overriding previous private promotion with local promotion");
+        // Recompute the local size for the previous definition with adapted
+        // promotion target.
+        auto NewPrevLocalSize =
+            getLocalSize(PreviousDefinition.NDRange,
+                         PreviousDefinition.Definition, Promotion::Local);
+
+        if (!NewPrevLocalSize.has_value()) {
+          printPerformanceWarning(
+              "Not performing specified local promotion because previous "
+              "kernels did not specify a local size");
+          PreviousDefinition.PromotionTarget = Promotion::None;
+          return;
+        }
+
+        PreviousDefinition.LocalSize = NewPrevLocalSize.value();
+        PreviousDefinition.PromotionTarget = Promotion::Local;
+      }
+      if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
+        printPerformanceWarning("Not performing specified local promotion due "
+                                "to work-group size mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      if (!accessorEquals(Req, PreviousDefinition.Definition)) {
+        printPerformanceWarning("Not performing specified promotion, due to "
+                                "accessor parameter mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      return;
+    }
+    case Promotion::Private: {
+      if (PreviousDefinition.PromotionTarget == Promotion::None) {
+        printPerformanceWarning(
+            "Not performing specified private promotion, due to previous "
+            "mismatch or because previous accessor specified no promotion");
+        return;
+      }
+
+      if (PreviousDefinition.PromotionTarget == Promotion::Local) {
+        // Recompute the local size with adapted promotion target.
+        auto ThisLocalSize = getLocalSize(NDRange, Req, Promotion::Local);
+        if (!ThisLocalSize.has_value()) {
+          printPerformanceWarning("Work-group size for local promotion not "
+                                  "specified, not performing internalization");
+          PreviousDefinition.PromotionTarget = Promotion::None;
+          return;
+        }
+
+        if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
+          printPerformanceWarning(
+              "Not performing specified local promotion due "
+              "to work-group size mismatch");
+          PreviousDefinition.PromotionTarget = Promotion::None;
+          return;
+        }
+
+        if (!accessorEquals(Req, PreviousDefinition.Definition)) {
+          printPerformanceWarning("Not performing local promotion, due to "
+                                  "accessor parameter mismatch");
+          PreviousDefinition.PromotionTarget = Promotion::None;
+          return;
+        }
+
+        printPerformanceWarning(
+            "Performing local internalization instead, because previous "
+            "accessor specified local promotion");
+        return;
+      }
+
+      // Previous accessors also specified private promotion.
+      if (PreviousDefinition.LocalSize != ThisLocalSize.value()) {
+        printPerformanceWarning(
+            "Not performing specified private promotion due "
+            "to work-group size mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      if (!accessorEquals(Req, PreviousDefinition.Definition)) {
+        printPerformanceWarning("Not performing specified promotion, due to "
+                                "accessor parameter mismatch");
+        PreviousDefinition.PromotionTarget = Promotion::None;
+        return;
+      }
+      return;
+    }
+    }
+  } else {
+    if (ThisPromotionTarget == Promotion::Local && !ThisLocalSize.has_value()) {
+      printPerformanceWarning("Work-group size for local promotion not "
+                              "specified, not performing internalization");
+      ThisPromotionTarget = Promotion::None;
+      ThisLocalSize = 0;
+    }
+    assert(ThisLocalSize.has_value());
+    Promotions.emplace(Req->MSYCLMemObj,
+                       PromotionInformation{ThisPromotionTarget, KernelIndex,
+                                            ArgFunctionIndex, Req, NDRange,
+                                            ThisLocalSize.value(),
+                                            std::vector<bool>()});
+  }
+}
+
+// Identify a parameter by the argument description, the kernel index and the
+// parameter index in that kernel.
+struct Param {
+  ArgDesc Arg;
+  unsigned KernelIndex;
+  unsigned ArgIndex;
+  bool Used;
+  Param(ArgDesc Argument, unsigned KernelIdx, unsigned ArgIdx, bool InUse)
+      : Arg{Argument}, KernelIndex{KernelIdx}, ArgIndex{ArgIdx}, Used{InUse} {}
+};
+
+using ParamList = std::vector<Param>;
+
+using ParamIterator = std::vector<Param>::iterator;
+
+std::vector<Param>::const_iterator
+detectIdenticalParameter(std::vector<Param> &Params, ArgDesc Arg) {
+  for (auto I = Params.begin(); I < Params.end(); ++I) {
+    // Two arguments of different type can never be identical.
+    if (I->Arg.MType == Arg.MType) {
+      if (Arg.MType == kernel_param_kind_t::kind_pointer ||
+          Arg.MType == kernel_param_kind_t::kind_std_layout) {
+        // Compare size and, if the size is identical, the content byte-by-byte.
+        if ((Arg.MSize == I->Arg.MSize) &&
+            std::memcmp(Arg.MPtr, I->Arg.MPtr, Arg.MSize) == 0) {
+          return I;
+        }
+      } else if (Arg.MType == kernel_param_kind_t::kind_accessor) {
+        Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
+        Requirement *Other = static_cast<Requirement *>(I->Arg.MPtr);
+        if (accessorEquals(Req, Other)) {
+          return I;
+        }
+      }
+    }
+  }
+  return Params.end();
+}
+
+template <typename T, typename F = typename std::remove_const_t<
+                          typename std::remove_reference_t<T>>>
+F *storePlainArg(std::vector<std::vector<char>> &ArgStorage, T &&Arg) {
+  ArgStorage.emplace_back(sizeof(T));
+  auto Storage = reinterpret_cast<F *>(ArgStorage.back().data());
+  *Storage = Arg;
+  return Storage;
+}
+
+void *storePlainArgRaw(std::vector<std::vector<char>> &ArgStorage, void *ArgPtr,
+                       size_t ArgSize) {
+  ArgStorage.emplace_back(ArgSize);
+  void *Storage = ArgStorage.back().data();
+  std::memcpy(Storage, ArgPtr, ArgSize);
+  return Storage;
+}
+
+static ParamIterator preProcessArguments(
+    std::vector<std::vector<char>> &ArgStorage, ParamIterator Arg,
+    PromotionMap &PromotedAccs,
+    std::vector<::jit_compiler::ParameterInternalization> &InternalizeParams,
+    std::vector<::jit_compiler::JITConstant> &JITConstants,
+    ParamList &NonIdenticalParams,
+    ::jit_compiler::ParamIdentList &ParamIdentities) {
+
+  // Unused arguments are still in the list at this point (because we
+  // need them for accessor handling), but there's not pre-processing
+  // that needs to be done.
+  if (!Arg->Used) {
+    return ++Arg;
+  }
+
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
+    // Pointer arguments are only stored in the kernel functor object, which
+    // will go out-of-scope before we execute the fused kernel. Therefore, we
+    // need to copy the pointer (not the memory it's pointing to) to a permanent
+    // location and update the argument.
+    Arg->Arg.MPtr =
+        storePlainArg(ArgStorage, *static_cast<void **>(Arg->Arg.MPtr));
+  }
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
+    // Standard layout arguments are only stored in the kernel functor object,
+    // which will go out-of-scope before we execute the fused kernel. Therefore,
+    // we need to copy the argument to a permant location and update the
+    // argument.
+    Arg->Arg.MPtr = storePlainArgRaw(ArgStorage, Arg->Arg.MPtr, Arg->Arg.MSize);
+  }
+  // First check if there's already another parameter with identical
+  // value.
+  auto Identical = detectIdenticalParameter(NonIdenticalParams, Arg->Arg);
+  if (Identical != NonIdenticalParams.end()) {
+    ::jit_compiler::Parameter ThisParam{Arg->KernelIndex, Arg->ArgIndex};
+    ::jit_compiler::Parameter IdenticalParam{Identical->KernelIndex,
+                                             Identical->ArgIndex};
+    ::jit_compiler::ParameterIdentity Identity{ThisParam, IdenticalParam};
+    ParamIdentities.push_back(Identity);
+    return ++Arg;
+  }
+
+  if (Arg->Arg.MType == kernel_param_kind_t::kind_accessor) {
+    // Get local and private promotion information from accessors.
+    Requirement *Req = static_cast<Requirement *>(Arg->Arg.MPtr);
+    auto &Internalization = PromotedAccs.at(Req->MSYCLMemObj);
+    auto PromotionTarget = Internalization.PromotionTarget;
+    if (PromotionTarget == Promotion::Private ||
+        PromotionTarget == Promotion::Local) {
+      // The accessor should be promoted.
+      if (Internalization.KernelIndex == Arg->KernelIndex &&
+          Internalization.ArgIndex == Arg->ArgIndex) {
+        // This is the first accessor for this buffer that should be
+        // internalized.
+        InternalizeParams.emplace_back(
+            ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+            (PromotionTarget == Promotion::Private)
+                ? ::jit_compiler::Internalization::Private
+                : ::jit_compiler::Internalization::Local,
+            Internalization.LocalSize);
+        // If an accessor will be promoted, i.e., if it has the promotion
+        // property attached to it, the next three arguments, that are
+        // associated with the accessor (access range, memory range, offset),
+        // must not participate in identical parameter detection or constant
+        // propagation, because their values will change if promotion happens.
+        // Therefore, we can just skip them here, but we need to remember which
+        // of them are used.
+        for (unsigned I = 0; I < 4; ++I) {
+          Internalization.UsedParams.push_back(Arg->Used);
+          ++Arg;
+        }
+      } else {
+        // We have previously encountered an accessor the same buffer, which
+        // should be internalized. We can add parameter identities for the
+        // accessor argument and the next three arguments (range, memory range
+        // and offset, if they are used).
+        unsigned Increment = 0;
+        for (unsigned I = 0; I < 4; ++I) {
+          // If the argument is used in both cases, i.e., on the original
+          // accessor to be internalized, and this one, we can insert a
+          // parameter identity.
+          if (Arg->Used && Internalization.UsedParams[I]) {
+            ::jit_compiler::Parameter ThisParam{Arg->KernelIndex,
+                                                Arg->ArgIndex};
+            ::jit_compiler::Parameter IdenticalParam{
+                Internalization.KernelIndex,
+                Internalization.ArgIndex + Increment};
+            ::jit_compiler::ParameterIdentity Identity{ThisParam,
+                                                       IdenticalParam};
+            ParamIdentities.push_back(Identity);
+          }
+          if (Internalization.UsedParams[I]) {
+            ++Increment;
+          }
+          ++Arg;
+        }
+      }
+      return Arg;
+    } else {
+      // The accessor will not be promoted, so it can participate in identical
+      // parameter detection.
+      NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
+                                      true);
+      return ++Arg;
+    }
+  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_std_layout) {
+    // No identical parameter exists, so add this to the list.
+    NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
+                                    true);
+    // Propagate values of scalar parameters as constants to the JIT
+    // compiler.
+    JITConstants.emplace_back(
+        ::jit_compiler::Parameter{Arg->KernelIndex, Arg->ArgIndex},
+        Arg->Arg.MPtr, Arg->Arg.MSize);
+    return ++Arg;
+  } else if (Arg->Arg.MType == kernel_param_kind_t::kind_pointer) {
+    // No identical parameter exists, so add this to the list.
+    NonIdenticalParams.emplace_back(Arg->Arg, Arg->KernelIndex, Arg->ArgIndex,
+                                    true);
+    return ++Arg;
+  }
+  return ++Arg;
+}
+
+static void
+updatePromotedArgs(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo,
+                   NDRDescT NDRange, std::vector<ArgDesc> &FusedArgs,
+                   std::vector<std::vector<char>> &FusedArgStorage) {
+  auto &ArgUsageInfo = FusedKernelInfo.Args.UsageMask;
+  assert(ArgUsageInfo.size() == FusedArgs.size());
+  for (size_t ArgIndex = 0; ArgIndex < ArgUsageInfo.size();) {
+    bool PromotedToPrivate =
+        (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedPrivate);
+    bool PromotedToLocal =
+        (ArgUsageInfo[ArgIndex] & ::jit_compiler::ArgUsage::PromotedLocal);
+    if (PromotedToLocal || PromotedToPrivate) {
+      // For each internalized accessor, we need to override four arguments
+      // (see 'addArgsForGlobalAccessor' in handler.cpp for reference), i.e.,
+      // the pointer itself, plus twice the range and the offset.
+      auto &OldArgDesc = FusedArgs[ArgIndex];
+      assert(OldArgDesc.MType == kernel_param_kind_t::kind_accessor);
+      auto *Req = static_cast<Requirement *>(OldArgDesc.MPtr);
+
+      // The stored args are all three-dimensional, but depending on the
+      // actual number of dimensions of the accessor, only a part of that
+      // argument is later on passed to the kernel.
+      const size_t SizeAccField =
+          sizeof(size_t) * (Req->MDims == 0 ? 1 : Req->MDims);
+      // Compute the local size and use it for the range parameters.
+      auto LocalSize = getLocalSize(NDRange, Req,
+                                    (PromotedToPrivate) ? Promotion::Private
+                                                        : Promotion::Local);
+      range<3> AccessRange{1, 1, LocalSize.value()};
+      auto *RangeArg = storePlainArg(FusedArgStorage, AccessRange);
+      // Use all-zero as the offset
+      id<3> AcessOffset{0, 0, 0};
+      auto *OffsetArg = storePlainArg(FusedArgStorage, AcessOffset);
+
+      // Override the arguments.
+      // 1. Override the pointer with a std-layout argument with 'nullptr' as
+      // value. handler.cpp does the same for local accessors.
+      int SizeInBytes = Req->MElemSize * LocalSize.value();
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, nullptr, SizeInBytes,
+                  static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 2. Access Range
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 3. Memory Range
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, RangeArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+      // 4. Offset
+      FusedArgs[ArgIndex] =
+          ArgDesc{kernel_param_kind_t::kind_std_layout, OffsetArg,
+                  static_cast<int>(SizeAccField), static_cast<int>(ArgIndex)};
+      ++ArgIndex;
+    } else {
+      ++ArgIndex;
+    }
+  }
+}
+
+std::unique_ptr<detail::CG>
+jit_compiler::fuseKernels(QueueImplPtr Queue,
+                          std::vector<ExecCGCommand *> &InputKernels,
+                          const property_list &PropList) {
+  // Retrieve the device binary from each of the input
+  // kernels to hand them over to the JIT compiler.
+  std::vector<::jit_compiler::SYCLKernelInfo> InputKernelInfo;
+  std::vector<std::string> InputKernelNames;
+  // Collect argument information from all input kernels.
+  std::vector<std::vector<char>> ArgsStorage;
+  std::vector<detail::AccessorImplPtr> AccStorage;
+  std::vector<Requirement *> Requirements;
+  std::vector<detail::EventImplPtr> Events;
+  NDRDescT NDRDesc;
+  unsigned KernelIndex = 0;
+  ParamList FusedParams;
+  PromotionMap PromotedAccs;
+  // TODO(Lukas, ONNX-399): Collect information about streams and auxiliary
+  // resources (which contain reductions) and figure out how to fuse them.
+  for (auto &RawCmd : InputKernels) {
+    auto *KernelCmd = static_cast<ExecCGCommand *>(RawCmd);
+    auto &CG = KernelCmd->getCG();
+    assert(CG.getType() == CG::Kernel);
+    auto *KernelCG = static_cast<CGExecKernel *>(&CG);
+
+    auto KernelName = KernelCG->MKernelName;
+    if (KernelName.empty()) {
+      printPerformanceWarning(
+          "Cannot fuse kernel with invalid kernel function name");
+      return nullptr;
+    }
+    const RTDeviceBinaryImage *DeviceImage = nullptr;
+    RT::PiProgram Program = nullptr;
+    if (KernelCG->getKernelBundle() != nullptr) {
+      // Retrieve the device image from the kernel bundle.
+      auto KernelBundle = KernelCG->getKernelBundle();
+      kernel_id KernelID =
+          detail::ProgramManager::getInstance().getSYCLKernelID(KernelName);
+
+      auto SyclKernel = detail::getSyclObjImpl(
+          KernelBundle->get_kernel(KernelID, KernelBundle));
+
+      DeviceImage = SyclKernel->getDeviceImage()->get_bin_image_ref();
+      Program = SyclKernel->getDeviceImage()->get_program_ref();
+    } else if (KernelCG->MSyclKernel != nullptr) {
+      DeviceImage =
+          KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
+      Program = KernelCG->MSyclKernel->getDeviceImage()->get_program_ref();
+    } else {
+      auto ContextImpl = Queue->getContextImplPtr();
+      auto Context = detail::createSyclObjFromImpl<context>(ContextImpl);
+      auto DeviceImpl = Queue->getDeviceImplPtr();
+      auto Device = detail::createSyclObjFromImpl<device>(DeviceImpl);
+      DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage(
+          KernelCG->MOSModuleHandle, KernelName, Context, Device);
+      Program = detail::ProgramManager::getInstance().createPIProgram(
+          *DeviceImage, Context, Device);
+    }
+    if (!DeviceImage || !Program) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
+    ProgramManager::KernelArgMask EliminatedArgs;
+    if (Program && (KernelCG->MSyclKernel == nullptr ||
+                    !KernelCG->MSyclKernel->isCreatedFromSource())) {
+      EliminatedArgs =
+          detail::ProgramManager::getInstance().getEliminatedKernelArgMask(
+              KernelCG->MOSModuleHandle, Program, KernelName);
+    }
+
+    // Collect information about the arguments of this kernel.
+
+    // Might need to sort the arguments in case they are not already sorted,
+    // see also the similar code in commands.cpp.
+    auto Args = KernelCG->MArgs;
+    std::sort(Args.begin(), Args.end(), [](const ArgDesc &A, const ArgDesc &B) {
+      return A.MIndex < B.MIndex;
+    });
+
+    ::jit_compiler::SYCLArgumentDescriptor ArgDescriptor;
+    size_t ArgIndex = 0;
+    // The kernel function in SPIR-V will only have the non-eliminated
+    // arguments, so keep track of this "actual" argument index.
+    unsigned ArgFunctionIndex = 0;
+    for (auto &Arg : Args) {
+      ArgDescriptor.Kinds.push_back(translateArgType(Arg.MType));
+      // DPC++ internally uses 'true' to indicate that an argument has been
+      // eliminated, while the JIT compiler uses 'true' to indicate an
+      // argument is used. Translate this here.
+      bool Eliminated = !EliminatedArgs.empty() && EliminatedArgs[ArgIndex++];
+      ArgDescriptor.UsageMask.emplace_back(!Eliminated);
+
+      // If the argument has not been eliminated, i.e., is still present on
+      // the kernel function in LLVM-IR/SPIR-V, collect information about the
+      // argument for performance optimizations in the JIT compiler.
+      if (!Eliminated) {
+        if (Arg.MType == kernel_param_kind_t::kind_accessor) {
+          resolveInternalization(Arg, KernelIndex, ArgFunctionIndex,
+                                 KernelCG->MNDRDesc, PromotedAccs);
+        }
+        FusedParams.emplace_back(Arg, KernelIndex, ArgFunctionIndex, true);
+        ++ArgFunctionIndex;
+      } else {
+        FusedParams.emplace_back(Arg, KernelIndex, 0, false);
+      }
+    }
+
+    // TODO(Lukas, ONNX-399): Check for the correct kernel bundle state of the
+    // device image?
+    auto &RawDeviceImage = DeviceImage->getRawData();
+    auto DeviceImageSize = static_cast<size_t>(RawDeviceImage.BinaryEnd -
+                                               RawDeviceImage.BinaryStart);
+    // Set 0 as the number of address bits, because the JIT compiler can set
+    // this field based on information from SPIR-V/LLVM module's data-layout.
+    auto BinaryImageFormat =
+        translateBinaryImageFormat(DeviceImage->getFormat());
+    if (BinaryImageFormat == ::jit_compiler::BinaryFormat::INVALID) {
+      printPerformanceWarning("No suitable IR available for fusion");
+      return nullptr;
+    }
+    ::jit_compiler::SYCLKernelBinaryInfo BinInfo{
+        translateBinaryImageFormat(DeviceImage->getFormat()), 0,
+        RawDeviceImage.BinaryStart, DeviceImageSize};
+
+    InputKernelInfo.emplace_back(KernelName, ArgDescriptor, BinInfo);
+    InputKernelNames.push_back(KernelName);
+
+    // Collect information for the fused kernel
+
+    // TODO(Lukas, ONNX-399): Currently assuming the NDRDesc is identical for
+    // all input kernels. Actually verify this here or in the graph_builder.
+    auto &CurrentNDR = KernelCG->MNDRDesc;
+    if (CurrentNDR.GlobalSize[0] == 0 && CurrentNDR.NumWorkGroups[0] != 0) {
+      // Some overloads of parallel_for_work_group only specify the number of
+      // work-groups, so this can be used to identify hierarchical parallel
+      // kernels, which are not supported by fusion for now.
+      printPerformanceWarning(
+          "Cannot fuse kernel with hierarchical parallelism");
+      return nullptr;
+      // Not all overloads of parallel_for_work_group only specify the number of
+      // work-groups, so the above mechanism might not detect all hierarchical
+      // parallelism.
+      // TODO(Lukas, CRD-6): Find a more reliable way to detect hierarchical
+      // parallelism.
+    }
+    if (KernelIndex == 0) {
+      NDRDesc = CurrentNDR;
+    } else {
+      if (CurrentNDR.Dims != NDRDesc.Dims) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different dimensionality");
+        return nullptr;
+      }
+      if (CurrentNDR.GlobalOffset != NDRDesc.GlobalOffset) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different global offset");
+        return nullptr;
+      }
+      if (CurrentNDR.GlobalSize != NDRDesc.GlobalSize) {
+        printPerformanceWarning(
+            "Cannot fuse kerneles with different global size");
+        return nullptr;
+      }
+      if (CurrentNDR.LocalSize[0] != 0 &&
+          CurrentNDR.LocalSize != NDRDesc.LocalSize) {
+        printPerformanceWarning(
+            "Cannot fuse kernels with different local size");
+        return nullptr;
+      }
+    }
+    // We need to copy the storages here. The input CGs might be eliminated
+    // before the fused kernel gets executed, so we need to copy the storages
+    // here to make sure the arguments don't die on us before executing the
+    // fused kernel.
+    ArgsStorage.insert(ArgsStorage.end(), KernelCG->getArgsStorage().begin(),
+                       KernelCG->getArgsStorage().end());
+    AccStorage.insert(AccStorage.end(), KernelCG->getAccStorage().begin(),
+                      KernelCG->getAccStorage().end());
+    // TODO(Lukas, ONNX-399): Does the MSharedPtrStorage contain any
+    // information about actual shared pointers beside the kernel bundle and
+    // handler impl? If yes, we might need to copy it here.
+    Requirements.insert(Requirements.end(), KernelCG->MRequirements.begin(),
+                        KernelCG->MRequirements.end());
+    Events.insert(Events.end(), KernelCG->MEvents.begin(),
+                  KernelCG->MEvents.end());
+    ++KernelIndex;
+  }
+
+  // Pre-process the arguments, to detect identical parameters or arguments that
+  // can be constant-propagated by the JIT compiler.
+  std::vector<::jit_compiler::ParameterInternalization> InternalizeParams;
+  std::vector<::jit_compiler::JITConstant> JITConstants;
+  ::jit_compiler::ParamIdentList ParamIdentities;
+  ParamList NonIdenticalParameters;
+  for (auto PI = FusedParams.begin(); PI != FusedParams.end();) {
+    PI = preProcessArguments(ArgsStorage, PI, PromotedAccs, InternalizeParams,
+                             JITConstants, NonIdenticalParameters,
+                             ParamIdentities);
+  }
+
+  // Retrieve barrier flags.
+  int BarrierFlags =
+      (PropList
+           .has_property<ext::codeplay::experimental::property::no_barriers>())
+          ? -1
+          : 3;
+
+  static size_t FusedKernelNameIndex = 0;
+  std::stringstream FusedKernelName;
+  FusedKernelName << "fused_" << FusedKernelNameIndex++;
+  ::jit_compiler::Config JITConfig;
+  bool DebugEnabled =
+      detail::SYCLConfig<detail::SYCL_RT_WARNING_LEVEL>::get() > 0;
+  JITConfig.set<::jit_compiler::option::JITEnableVerbose>(DebugEnabled);
+  // TODO: Enable caching in a separate PR.
+
+  auto FusionResult = ::jit_compiler::KernelFusion::fuseKernels(
+      *MJITContext, std::move(JITConfig), InputKernelInfo, InputKernelNames,
+      FusedKernelName.str(), ParamIdentities, BarrierFlags, InternalizeParams,
+      JITConstants);
+
+  if (FusionResult.failed()) {
+    if (DebugEnabled) {
+      std::cerr
+          << "ERROR: JIT compilation for kernel fusion failed with message:\n"
+          << FusionResult.getErrorMessage() << "\n";
+    }
+    return nullptr;
+  }
+
+  auto &FusedKernelInfo = FusionResult.getKernelInfo();
+
+  std::vector<ArgDesc> FusedArgs;
+  int FusedArgIndex = 0;
+  for (auto &Param : FusedParams) {
+    // Add to the argument list of the fused kernel, but with the correct
+    // new index in the fused kernel.
+    auto &Arg = Param.Arg;
+    FusedArgs.emplace_back(Arg.MType, Arg.MPtr, Arg.MSize, FusedArgIndex++);
+  }
+
+  // Update the kernel arguments for internalized accessors.
+  updatePromotedArgs(FusedKernelInfo, NDRDesc, FusedArgs, ArgsStorage);
+
+  if (!FusionResult.cached()) {
+    auto PIDeviceBinaries = createPIDeviceBinary(FusedKernelInfo);
+    detail::ProgramManager::getInstance().addImages(PIDeviceBinaries);
+  } else if (DebugEnabled) {
+    std::cerr << "INFO: Re-using existing device binary for fused kernel\n";
+  }
+
+  // Create a kernel bundle for the fused kernel.
+  // Kernel bundles are stored in the CG as one of the "extended" members.
+  auto FusedKernelId = detail::ProgramManager::getInstance().getSYCLKernelID(
+      FusedKernelInfo.Name);
+  std::vector<std::shared_ptr<const void>> RawExtendedMembers;
+
+  std::shared_ptr<detail::kernel_bundle_impl> KernelBundleImplPtr =
+      detail::getSyclObjImpl(get_kernel_bundle<bundle_state::executable>(
+          Queue->get_context(), {Queue->get_device()}, {FusedKernelId}));
+
+  std::unique_ptr<detail::CG> FusedCG;
+  FusedCG.reset(new detail::CGExecKernel(
+      NDRDesc, nullptr, nullptr, std::move(KernelBundleImplPtr),
+      std::move(ArgsStorage), std::move(AccStorage),
+      std::move(RawExtendedMembers), std::move(Requirements), std::move(Events),
+      std::move(FusedArgs), FusedKernelInfo.Name, OSUtil::DummyModuleHandle, {},
+      {}, CG::CGTYPE::Kernel));
+  return FusedCG;
+}
+
+pi_device_binaries jit_compiler::createPIDeviceBinary(
+    const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo) {
+
+  DeviceBinaryContainer Binary;
+
+  // Create an offload entry for the fused kernel.
+  // It seems to be OK to set zero for most of the information here, at least
+  // that is the case for compiled SPIR-V binaries.
+  OffloadEntryContainer Entry{FusedKernelInfo.Name, nullptr, 0, 0, 0};
+  Binary.addOffloadEntry(std::move(Entry));
+
+  // Create a property entry for the argument usage mask for the fused kernel.
+  auto ArgMask = encodeArgUsageMask(FusedKernelInfo.Args.UsageMask);
+  PropertyContainer ArgMaskProp{FusedKernelInfo.Name, ArgMask.data(),
+                                ArgMask.size(),
+                                pi_property_type::PI_PROPERTY_TYPE_BYTE_ARRAY};
+
+  // Create a property set for the argument usage masks of all kernels
+  // (currently only one).
+  PropertySetContainer ArgMaskPropSet{
+      __SYCL_PI_PROPERTY_SET_KERNEL_PARAM_OPT_INFO};
+
+  ArgMaskPropSet.addProperty(std::move(ArgMaskProp));
+
+  Binary.addProperty(std::move(ArgMaskPropSet));
+
+  DeviceBinariesCollection Collection;
+  Collection.addDeviceBinary(std::move(Binary),
+                             FusedKernelInfo.BinaryInfo.BinaryStart,
+                             FusedKernelInfo.BinaryInfo.BinarySize,
+                             FusedKernelInfo.BinaryInfo.AddressBits);
+
+  JITDeviceBinaries.push_back(std::move(Collection));
+  return JITDeviceBinaries.back().getPIDeviceStruct();
+}
+
+std::vector<uint8_t> jit_compiler::encodeArgUsageMask(
+    const ::jit_compiler::ArgUsageMask &Mask) const {
+  // This must match the decoding logic in program_manager.cpp.
+  constexpr uint64_t NBytesForSize = 8;
+  constexpr uint64_t NBitsInElement = 8;
+  uint64_t Size = static_cast<uint64_t>(Mask.size());
+  // Round the size to the next multiple of 8
+  uint64_t RoundedSize =
+      ((Size + (NBitsInElement - 1)) & (~(NBitsInElement - 1)));
+  std::vector<uint8_t> Encoded((RoundedSize / NBitsInElement) + NBytesForSize,
+                               0u);
+  // First encode the size of the actual mask
+  for (size_t i = 0; i < NBytesForSize; ++i) {
+    uint8_t Byte =
+        static_cast<uint8_t>((RoundedSize >> i * NBitsInElement) & 0xFF);
+    Encoded[i] = Byte;
+  }
+  // Encode the actual mask bit-wise
+  for (size_t i = 0; i < Size; ++i) {
+    // DPC++ internally uses 'true' to indicate that an argument has been
+    // eliminated, while the JIT compiler uses 'true' to indicate an argument
+    // is used. Translate this here.
+    if (!(Mask[i] & ::jit_compiler::ArgUsage::Used)) {
+      uint8_t &Byte = Encoded[NBytesForSize + (i / NBitsInElement)];
+      Byte |= static_cast<uint8_t>((1 << (i % NBitsInElement)));
+    }
+  }
+  return Encoded;
+}
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
+
+#endif // SYCL_EXT_CODEPLAY_KERNEL_FUSION
diff --git a/sycl/source/detail/jit_compiler.hpp b/sycl/source/detail/jit_compiler.hpp
new file mode 100644
index 000000000000..522c0749ef75
--- /dev/null
+++ b/sycl/source/detail/jit_compiler.hpp
@@ -0,0 +1,62 @@
+//==--- jit_compiler.hpp - SYCL runtime JIT compiler for kernel fusion -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <detail/jit_device_binaries.hpp>
+#include <detail/scheduler/commands.hpp>
+#include <detail/scheduler/scheduler.hpp>
+
+namespace jit_compiler {
+class JITContext;
+struct SYCLKernelInfo;
+using ArgUsageMask = std::vector<unsigned char>;
+} // namespace jit_compiler
+
+struct pi_device_binaries_struct;
+struct _pi_offload_entry_struct;
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+class jit_compiler {
+
+public:
+  std::unique_ptr<detail::CG>
+  fuseKernels(QueueImplPtr Queue, std::vector<ExecCGCommand *> &InputKernels,
+              const property_list &);
+
+  static jit_compiler &get_instance() {
+    static jit_compiler instance{};
+    return instance;
+  }
+
+private:
+  jit_compiler();
+  ~jit_compiler();
+  jit_compiler(const jit_compiler &) = delete;
+  jit_compiler(jit_compiler &&) = delete;
+  jit_compiler &operator=(const jit_compiler &) = delete;
+  jit_compiler &operator=(const jit_compiler &&) = delete;
+
+  pi_device_binaries
+  createPIDeviceBinary(const ::jit_compiler::SYCLKernelInfo &FusedKernelInfo);
+
+  std::vector<uint8_t>
+  encodeArgUsageMask(const ::jit_compiler::ArgUsageMask &Mask) const;
+
+  // Manages the lifetime of the PI structs for device binaries.
+  std::vector<DeviceBinariesCollection> JITDeviceBinaries;
+
+  std::unique_ptr<::jit_compiler::JITContext> MJITContext;
+};
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/jit_device_binaries.cpp b/sycl/source/detail/jit_device_binaries.cpp
new file mode 100644
index 000000000000..0aa778da1424
--- /dev/null
+++ b/sycl/source/detail/jit_device_binaries.cpp
@@ -0,0 +1,138 @@
+//==- jit_device_binaries.cpp - Runtime construction of PI device binaries -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <detail/jit_device_binaries.hpp>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+OffloadEntryContainer::OffloadEntryContainer(const std::string &Name,
+                                             void *Addr, size_t Size,
+                                             int32_t Flags, int32_t Reserved)
+    : KernelName{new char[Name.length() + 1]}, Address{Addr}, EntrySize{Size},
+      EntryFlags{Flags}, EntryReserved{Reserved} {
+  std::memcpy(KernelName.get(), Name.c_str(), Name.length() + 1);
+}
+
+_pi_offload_entry_struct OffloadEntryContainer::getPIOffloadEntry() {
+  return _pi_offload_entry_struct{Address, KernelName.get(), EntrySize,
+                                  EntryFlags, EntryReserved};
+}
+
+PropertyContainer::PropertyContainer(const std::string &Name, void *Data,
+                                     size_t Size, uint32_t Type)
+    : PropName{new char[Name.length() + 1]}, Value{new unsigned char[Size]},
+      ValueSize{Size}, PropType{Type} {
+  std::memcpy(PropName.get(), Name.c_str(), Name.length() + 1);
+  std::memcpy(Value.get(), Data, Size);
+}
+
+_pi_device_binary_property_struct PropertyContainer::getPIProperty() {
+  return _pi_device_binary_property_struct{PropName.get(), Value.get(),
+                                           PropType, ValueSize};
+}
+
+PropertySetContainer::PropertySetContainer(const std::string &Name)
+    : SetName{new char[Name.length() + 1]} {
+  std::memcpy(SetName.get(), Name.c_str(), Name.length() + 1);
+}
+
+void PropertySetContainer::addProperty(PropertyContainer &&Prop) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIPropertySet(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIProperties.push_back(Prop.getPIProperty());
+  Properties.push_back(std::move(Prop));
+}
+
+_pi_device_binary_property_set_struct PropertySetContainer::getPIPropertySet() {
+  Fused = false;
+  return _pi_device_binary_property_set_struct{
+      const_cast<char *>(SetName.get()), PIProperties.data(),
+      PIProperties.data() + Properties.size()};
+}
+
+void DeviceBinaryContainer::addOffloadEntry(OffloadEntryContainer &&Cont) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceBinary(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIOffloadEntries.push_back(Cont.getPIOffloadEntry());
+  OffloadEntries.push_back(std::move(Cont));
+}
+
+void DeviceBinaryContainer::addProperty(PropertySetContainer &&Cont) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceBinary(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIPropertySets.push_back(Cont.getPIPropertySet());
+  PropertySets.push_back(std::move(Cont));
+}
+
+pi_device_binary_struct DeviceBinaryContainer::getPIDeviceBinary(
+    const unsigned char *BinaryStart, size_t BinarySize, size_t AddressBits) {
+  pi_device_binary_struct DeviceBinary;
+  DeviceBinary.Version = PI_DEVICE_BINARY_VERSION;
+  DeviceBinary.Kind = PI_DEVICE_BINARY_OFFLOAD_KIND_SYCL;
+  DeviceBinary.CompileOptions = "";
+  DeviceBinary.LinkOptions = "";
+  DeviceBinary.ManifestStart = nullptr;
+  DeviceBinary.ManifestEnd = nullptr;
+  // It is safe to use these pointers here, as their lifetime is managed by
+  // the JITContext.
+  DeviceBinary.BinaryStart = BinaryStart;
+  DeviceBinary.BinaryEnd = BinaryStart + BinarySize;
+  DeviceBinary.Format = PI_DEVICE_BINARY_TYPE_SPIRV;
+  DeviceBinary.DeviceTargetSpec = (AddressBits == 32)
+                                      ? __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV32
+                                      : __SYCL_PI_DEVICE_BINARY_TARGET_SPIRV64;
+  DeviceBinary.EntriesBegin = PIOffloadEntries.data();
+  DeviceBinary.EntriesEnd = PIOffloadEntries.data() + PIOffloadEntries.size();
+  DeviceBinary.PropertySetsBegin = PIPropertySets.data();
+  DeviceBinary.PropertySetsEnd = PIPropertySets.data() + PIPropertySets.size();
+  Fused = false;
+  return DeviceBinary;
+}
+
+void DeviceBinariesCollection::addDeviceBinary(DeviceBinaryContainer &&Cont,
+                                               const unsigned char *BinaryStart,
+                                               size_t BinarySize,
+                                               size_t AddressBits) {
+  // Adding to the vectors might trigger reallocation, which would invalidate
+  // the pointers used for PI structs if a PI struct has already been created
+  // via getPIDeviceStruct(). Forbid calls to this method after the first PI
+  // struct has been created.
+  assert(Fused && "Adding to container would invalidate existing PI structs");
+  PIBinaries.push_back(
+      Cont.getPIDeviceBinary(BinaryStart, BinarySize, AddressBits));
+  Binaries.push_back(std::move(Cont));
+}
+
+pi_device_binaries DeviceBinariesCollection::getPIDeviceStruct() {
+
+  PIStruct = std::make_unique<pi_device_binaries_struct>();
+  PIStruct->Version = PI_DEVICE_BINARIES_VERSION;
+  PIStruct->NumDeviceBinaries = PIBinaries.size();
+  PIStruct->DeviceBinaries = PIBinaries.data();
+  // According to documentation in pi.h, the HostEntries are not used and
+  // can therefore be null.
+  PIStruct->HostEntriesBegin = nullptr;
+  PIStruct->HostEntriesEnd = nullptr;
+  Fused = false;
+  return PIStruct.get();
+}
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/jit_device_binaries.hpp b/sycl/source/detail/jit_device_binaries.hpp
new file mode 100644
index 000000000000..7bf2c7d9fe07
--- /dev/null
+++ b/sycl/source/detail/jit_device_binaries.hpp
@@ -0,0 +1,154 @@
+//==- jit_device_binaries.hpp - Runtime construction of PI device binaries -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <cstring>
+#include <memory>
+#include <sycl/detail/pi.hpp>
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace detail {
+
+/// Representation of _pi_offload_entry for creation of JIT device binaries at
+/// runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class OffloadEntryContainer {
+public:
+  OffloadEntryContainer(const std::string &Name, void *Addr, size_t Size,
+                        int32_t Flags, int32_t Reserved);
+
+  OffloadEntryContainer(OffloadEntryContainer &&) = default;
+  OffloadEntryContainer &operator=(OffloadEntryContainer &&) = default;
+  ~OffloadEntryContainer() = default;
+  // Copying of the container is not allowed.
+  OffloadEntryContainer(const OffloadEntryContainer &) = delete;
+  OffloadEntryContainer &operator=(const OffloadEntryContainer &) = delete;
+
+  _pi_offload_entry_struct getPIOffloadEntry();
+
+private:
+  std::unique_ptr<char[]> KernelName;
+
+  void *Address;
+  size_t EntrySize;
+  int32_t EntryFlags;
+  int32_t EntryReserved;
+};
+
+/// Representation of _pi_device_binary_property_struct for creation of JIT
+/// device binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class PropertyContainer {
+
+public:
+  PropertyContainer(const std::string &Name, void *Data, size_t Size,
+                    uint32_t Type);
+
+  PropertyContainer(PropertyContainer &&) = default;
+  PropertyContainer &operator=(PropertyContainer &&) = default;
+  ~PropertyContainer() = default;
+  // Copying of the container is not allowed.
+  PropertyContainer(const PropertyContainer &) = delete;
+  PropertyContainer &operator=(const PropertyContainer &) = delete;
+
+  _pi_device_binary_property_struct getPIProperty();
+
+private:
+  std::unique_ptr<char[]> PropName;
+  std::unique_ptr<unsigned char[]> Value;
+  size_t ValueSize;
+  uint32_t PropType;
+};
+
+/// Representation of _pi_device_binary_property_set_struct for creation of JIT
+/// device binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class PropertySetContainer {
+public:
+  PropertySetContainer(const std::string &Name);
+
+  PropertySetContainer(PropertySetContainer &&) = default;
+  PropertySetContainer &operator=(PropertySetContainer &&) = default;
+  ~PropertySetContainer() = default;
+  // Copying of the container is not allowed, as it would invalidate PI structs.
+  PropertySetContainer(const PropertySetContainer &) = delete;
+  PropertySetContainer &operator=(const PropertySetContainer &) = delete;
+
+  void addProperty(PropertyContainer &&Prop);
+
+  _pi_device_binary_property_set_struct getPIPropertySet();
+
+private:
+  std::unique_ptr<char[]> SetName;
+  bool Fused = true;
+  std::vector<PropertyContainer> Properties;
+  std::vector<_pi_device_binary_property_struct> PIProperties;
+};
+
+/// Representation of pi_device_binary_struct for creation of JIT device
+/// binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class DeviceBinaryContainer {
+public:
+  DeviceBinaryContainer() = default;
+  DeviceBinaryContainer(DeviceBinaryContainer &&) = default;
+  DeviceBinaryContainer &operator=(DeviceBinaryContainer &&) = default;
+  ~DeviceBinaryContainer() = default;
+  // Copying of the container is not allowed, as it would invalidate PI structs.
+  DeviceBinaryContainer(const DeviceBinaryContainer &) = delete;
+  DeviceBinaryContainer &operator=(const DeviceBinaryContainer &) = delete;
+
+  void addOffloadEntry(OffloadEntryContainer &&Cont);
+
+  void addProperty(PropertySetContainer &&Cont);
+
+  pi_device_binary_struct getPIDeviceBinary(const unsigned char *BinaryStart,
+                                            size_t BinarySize,
+                                            size_t AddressBits);
+
+private:
+  bool Fused = true;
+  std::vector<OffloadEntryContainer> OffloadEntries;
+  std::vector<_pi_offload_entry_struct> PIOffloadEntries;
+  std::vector<PropertySetContainer> PropertySets;
+  std::vector<_pi_device_binary_property_set_struct> PIPropertySets;
+};
+
+/// Representation of pi_device_binaries_struct for creation of JIT device
+/// binaries at runtime.
+/// Owns the necessary data and provides raw pointers for the PI struct.
+class DeviceBinariesCollection {
+
+public:
+  DeviceBinariesCollection() = default;
+  DeviceBinariesCollection(DeviceBinariesCollection &&) = default;
+  DeviceBinariesCollection &operator=(DeviceBinariesCollection &&) = default;
+  ~DeviceBinariesCollection() = default;
+  // Copying of the container is not allowed.
+  DeviceBinariesCollection(const DeviceBinariesCollection &) = delete;
+  DeviceBinariesCollection &
+  operator=(const DeviceBinariesCollection &) = delete;
+
+  void addDeviceBinary(DeviceBinaryContainer &&Cont,
+                       const unsigned char *BinaryStart, size_t BinarySize,
+                       size_t AddressBits);
+  pi_device_binaries getPIDeviceStruct();
+
+private:
+  bool Fused = true;
+  std::unique_ptr<pi_device_binaries_struct> PIStruct;
+
+  std::vector<DeviceBinaryContainer> Binaries;
+  std::vector<pi_device_binary_struct> PIBinaries;
+};
+
+} // namespace detail
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 9e919aa78614..5f27ca5c0255 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -9,6 +9,10 @@
 #include "detail/config.hpp"
 #include <detail/context_impl.hpp>
 #include <detail/event_impl.hpp>
+#include <sycl/feature_test.hpp>
+#if SYCL_EXT_CODEPLAY_KERNEL_FUSION
+#include <detail/jit_compiler.hpp>
+#endif
 #include <detail/memory_manager.hpp>
 #include <detail/queue_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
@@ -928,7 +932,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
                                const QueueImplPtr &Queue,
                                std::vector<Command *> &ToEnqueue) {
   std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
-  const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
+  std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
 
   auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
   if (!NewCmd)
@@ -941,7 +945,40 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   auto QUniqueID = std::hash<QueueImplPtr>()(Queue);
   if (isInFusionMode(QUniqueID) && !NewCmd->isHostTask()) {
     auto *FusionCmd = findFusionList(QUniqueID)->second.get();
-    FusionCmd->addToFusionList(NewCmd.get());
+
+    bool dependsOnFusion = false;
+    for (auto Ev = Events.begin(); Ev != Events.end();) {
+      auto *EvDepCmd = static_cast<Command *>((*Ev)->getCommand());
+      if (!EvDepCmd) {
+        continue;
+      }
+      // Handle event dependencies on any commands part of another active
+      // fusion.
+      if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
+        printFusionWarning("Aborting fusion because of event dependency from a "
+                           "different fusion");
+        cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
+      }
+      // Check if this command depends on the placeholder command for the fusion
+      // itself participates in.
+      if (EvDepCmd == FusionCmd) {
+        Ev = Events.erase(Ev);
+        dependsOnFusion = true;
+      } else {
+        ++Ev;
+      }
+    }
+
+    // If this command has an explicit event dependency on the placeholder
+    // command for this fusion (because it used depends_on on the event returned
+    // by submitting another kernel to this fusion earlier), add a dependency on
+    // all the commands in the fusion list so far.
+    if (dependsOnFusion) {
+      for (auto *Cmd : FusionCmd->getFusionList()) {
+        Events.push_back(Cmd->getEvent());
+      }
+    }
+
     // Add the kernel to the graph, but delay the enqueue of any auxiliary
     // commands (e.g., allocations) resulting from that process by adding them
     // to the list of auxiliary commands of the fusion command.
@@ -964,21 +1001,10 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
         cancelFusion(DepCmd->getQueue(), ToEnqueue);
       }
     }
-    // Handle event dependencies on any commands part of another active fusion.
-    for (auto &Ev : Events) {
-      auto *EvDepCmd = static_cast<Command *>(Ev->getCommand());
-      if (!EvDepCmd) {
-        continue;
-      }
-      if (EvDepCmd->getQueue() != Queue && isPartOfActiveFusion(EvDepCmd)) {
-        printFusionWarning("Aborting fusion because of event dependency from a "
-                           "different fusion");
-        cancelFusion(EvDepCmd->getQueue(), ToEnqueue);
-      }
-    }
 
     // Set the fusion command, so we recognize when another command depends on a
     // kernel in the fusion list.
+    FusionCmd->addToFusionList(NewCmd.get());
     NewCmd->MFusionCmd = FusionCmd;
     std::vector<Command *> ToCleanUp;
     // Add an event dependency from the fusion placeholder command to the new
@@ -1427,7 +1453,8 @@ Scheduler::GraphBuilder::completeFusion(QueueImplPtr Queue,
 
   // TODO: The logic to invoke the JIT compiler to create a fused kernel from
   // the list will be added in a later PR.
-  auto FusedCG = nullptr;
+  auto FusedCG = detail::jit_compiler::get_instance().fuseKernels(
+      Queue, CmdList, PropList);
 
   if (!FusedCG) {
     // If the JIT compiler returns a nullptr, JIT compilation of the fused
diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index 0fcad85f2060..3257fe7d261e 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -120,3 +120,7 @@ if(SYCL_BUILD_PI_HIP)
 
   add_dependencies(check-sycl check-sycl-hip)
 endif()
+
+if(SYCL_ENABLE_KERNEL_FUSION)
+  add_dependencies(check-sycl check-sycl-fusion)
+endif(SYCL_ENABLE_KERNEL_FUSION)