diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index b7af801f2e141..4edc5f05f79ee 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -79,6 +79,10 @@ __SYCL_INLINE_VER_NAMESPACE(_V1) {
 class handler;
 template class buffer;
+
+namespace ext::oneapi::experimental::detail {
+class graph_impl;
+}
 
 namespace detail {
 class handler_impl;
@@ -372,6 +376,14 @@ class __SYCL_EXPORT handler {
           std::shared_ptr PrimaryQueue,
           std::shared_ptr SecondaryQueue, bool IsHost);
 
+  /// Constructs SYCL handler from Graph.
+  ///
+  /// The handler will add the command-group as a node to the graph rather than
+  /// enqueueing it straight away.
+  ///
+  /// \param Graph is a SYCL command_graph
+  handler(std::shared_ptr Graph);
+
   /// Stores copy of Arg passed to the MArgsStorage.
   template >>
@@ -2528,6 +2540,8 @@ class __SYCL_EXPORT handler {
 private:
   std::shared_ptr MImpl;
   std::shared_ptr MQueue;
+  std::shared_ptr MGraph;
+
   /// The storage for the arguments passed.
   /// We need to store a copy of values that are passed explicitly through
   /// set_arg, require and so on, because we need them to be alive after
@@ -2611,6 +2625,8 @@ class __SYCL_EXPORT handler {
   friend class ::MockHandler;
   friend class detail::queue_impl;
 
+  friend class ext::oneapi::experimental::detail::graph_impl;
+
   bool DisableRangeRounding();
   bool RangeRoundingTrace();
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 1af071d84ef0a..337486d6096fc 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -8,6 +8,7 @@
 #include
 #include
+#include
 #include
 
 namespace sycl {
@@ -61,76 +62,94 @@ void graph_impl::remove_root(const std::shared_ptr &Root) {
 // Recursive check if a graph node or its successors contains a given kernel
 // argument.
 //
-// @param[in] arg The kernel argument to check for.
-// @param[in] currentNode The current graph node being checked.
-// @param[in,out] deps The unique list of dependencies which have been
+// @param[in] Arg The kernel argument to check for.
+// @param[in] CurrentNode The current graph node being checked.
+// @param[in,out] Deps The unique list of dependencies which have been
 // identified for this arg.
-// @param[in] dereferencePtr if true arg comes direct from the handler in which
-// case it will need to be deferenced to check actual value.
 //
 // @returns True if a dependency was added in this node or any of its
 // successors.
 bool check_for_arg(const sycl::detail::ArgDesc &Arg,
                    const std::shared_ptr &CurrentNode,
-                   std::set> &Deps,
-                   bool DereferencePtr = false) {
+                   std::set> &Deps) {
   bool SuccessorAddedDep = false;
   for (auto &Successor : CurrentNode->MSuccessors) {
-    SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps, DereferencePtr);
+    SuccessorAddedDep |= check_for_arg(Arg, Successor, Deps);
   }
-  if (Deps.find(CurrentNode) == Deps.end() &&
-      CurrentNode->has_arg(Arg, DereferencePtr) && !SuccessorAddedDep) {
+  if (Deps.find(CurrentNode) == Deps.end() && CurrentNode->has_arg(Arg) &&
+      !SuccessorAddedDep) {
     Deps.insert(CurrentNode);
     return true;
   }
   return SuccessorAddedDep;
 }
 
-template
 std::shared_ptr
-graph_impl::add(const std::shared_ptr &impl, T cgf,
-                const std::vector &args,
-                const std::vector> &dep) {
-  std::shared_ptr nodeImpl =
-      std::make_shared(impl, cgf, args);
+graph_impl::add(const std::shared_ptr &Impl,
+                std::function CGF,
+                const std::vector &Args,
+                const std::vector> &Dep) {
+  sycl::handler Handler{Impl};
+  CGF(Handler);
+
+  return this->add(Impl, Handler.MKernel, Handler.MNDRDesc,
+                   Handler.MOSModuleHandle, Handler.MKernelName,
+                   Handler.MAccStorage, Handler.MLocalAccStorage,
+                   Handler.MRequirements, Handler.MArgs, {});
+}
+
+std::shared_ptr graph_impl::add(
+    const std::shared_ptr &Impl,
+    std::shared_ptr Kernel,
+    sycl::detail::NDRDescT NDRDesc, sycl::detail::OSModuleHandle OSModuleHandle,
+    std::string KernelName,
+    const std::vector &AccStorage,
+    const std::vector &LocalAccStorage,
+    const std::vector &Requirements,
+    const std::vector &Args,
+    const std::vector> &Dep) {
+  const std::shared_ptr &NodeImpl = std::make_shared(
+      Impl, Kernel, NDRDesc, OSModuleHandle, KernelName, AccStorage,
+      LocalAccStorage, Requirements, Args);
   // Copy deps so we can modify them
-  auto deps = dep;
+  auto Deps = Dep;
   // A unique set of dependencies obtained by checking kernel arguments
-  std::set> uniqueDeps;
-  for (auto &arg : args) {
-    if (arg.MType != sycl::detail::kernel_param_kind_t::kind_pointer) {
+  // for accessors
+  std::set> UniqueDeps;
+  for (auto &Arg : Args) {
+    if (Arg.MType != sycl::detail::kernel_param_kind_t::kind_accessor) {
       continue;
     }
     // Look through the graph for nodes which share this argument
-    for (auto nodePtr : MRoots) {
-      check_for_arg(arg, nodePtr, uniqueDeps, true);
+    for (auto NodePtr : MRoots) {
+      check_for_arg(Arg, NodePtr, UniqueDeps);
     }
   }
 
-  // Add any deps determined from arguments into the dependency list
-  deps.insert(deps.end(), uniqueDeps.begin(), uniqueDeps.end());
-  if (!deps.empty()) {
-    for (auto n : deps) {
-      n->register_successor(nodeImpl); // register successor
-      this->remove_root(nodeImpl);     // remove receiver from root node
+  // Add any deps determined from accessor arguments into the dependency list
+  Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end());
+  if (!Deps.empty()) {
+    for (auto N : Deps) {
+      N->register_successor(NodeImpl); // register successor
+      this->remove_root(NodeImpl);     // remove receiver from root node
                                        // list
     }
   } else {
-    this->add_root(nodeImpl);
+    this->add_root(NodeImpl);
   }
-  return nodeImpl;
+  return NodeImpl;
 }
 
 bool graph_impl::clear_queues() {
-  bool anyQueuesCleared = false;
-  for (auto &q : MRecordingQueues) {
-    q->setCommandGraph(nullptr);
-    anyQueuesCleared = true;
+  bool AnyQueuesCleared = false;
+  for (auto &Queue : MRecordingQueues) {
+    Queue->setCommandGraph(nullptr);
+    AnyQueuesCleared = true;
   }
   MRecordingQueues.clear();
-  return anyQueuesCleared;
+  return AnyQueuesCleared;
 }
 
 void node_impl::exec(const std::shared_ptr &Queue
@@ -139,7 +158,33 @@ void node_impl::exec(const std::shared_ptr &Queue
   for (auto Sender : MPredecessors)
     Deps.push_back(Sender->get_event());
 
-  MEvent = Queue->submit(wrapper{MBody, Deps}, Queue _CODELOCFW(CodeLoc));
+  // Enqueue kernel here instead of submit
+
+  std::vector RawEvents;
+  pi_event *OutEvent = nullptr;
+  auto NewEvent = std::make_shared(Queue);
+  NewEvent->setContextImpl(Queue->getContextImplPtr());
+  NewEvent->setStateIncomplete();
+  OutEvent = &NewEvent->getHandleRef();
+  pi_result Res =
+      Queue->getPlugin().call_nocheck(
+          sycl::detail::getSyclObjImpl(Queue->get_context())->getHandleRef(),
+          OutEvent);
+  if (Res != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::event,
+                          "Failed to create event for node submission");
+  }
+
+  pi_int32 Result = enqueueImpKernel(
+      Queue, MNDRDesc, MArgs, /* KernelBundleImpPtr */ nullptr, MKernel,
+      MKernelName, MOSModuleHandle, RawEvents, OutEvent, nullptr);
+  if (Result != pi_result::PI_SUCCESS) {
+    throw sycl::exception(errc::kernel, "Error enqueuing graph node kernel");
+  }
+  sycl::event QueueEvent =
+      sycl::detail::createSyclObjFromImpl(NewEvent);
+  Queue->addEvent(QueueEvent);
+  MEvent = QueueEvent;
 }
 
 } // namespace detail
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index a35b9858fa228..f913cae633954 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -9,9 +9,13 @@
 #pragma once
 
 #include
+#include
 #include
 #include
+#include
+
+#include
 #include
 #include
 #include
@@ -48,9 +52,24 @@ struct node_impl {
   std::vector> MSuccessors;
   std::vector> MPredecessors;
 
-  std::function MBody;
-
+  /// Kernel to be executed by this node
+  std::shared_ptr MKernel;
+  /// Description of the kernel global and local sizes as well as offset
+  sycl::detail::NDRDescT MNDRDesc;
+  /// Module handle for the kernel to be executed.
+  sycl::detail::OSModuleHandle MOSModuleHandle =
+      sycl::detail::OSUtil::ExeModuleHandle;
+  /// Kernel name inside the module
+  std::string MKernelName;
+  std::vector MAccStorage;
+  std::vector MLocalAccStorage;
+  std::vector MRequirements;
+
+  /// Store arg descriptors for the kernel arguments
   std::vector MArgs;
+  // We need to store local copies of the values pointed to by MArgs since they
+  // may go out of scope before execution.
+ std::vector> MArgStorage; void exec(const std::shared_ptr &Queue _CODELOCPARAM(&CodeLoc)); @@ -66,17 +85,30 @@ struct node_impl { sycl::event get_event(void) const { return MEvent; } - template - node_impl(const std::shared_ptr &Graph, T CGF, - const std::vector &Args) - : MScheduled(false), MGraph(Graph), MBody(CGF), MArgs(Args) { + node_impl( + const std::shared_ptr &Graph, + std::shared_ptr Kernel, + sycl::detail::NDRDescT NDRDesc, + sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, + const std::vector &AccStorage, + const std::vector &LocalAccStorage, + const std::vector &Requirements, + const std::vector &args) + : MScheduled(false), MGraph(Graph), MKernel(Kernel), MNDRDesc(NDRDesc), + MOSModuleHandle(OSModuleHandle), MKernelName(KernelName), + MAccStorage(AccStorage), MLocalAccStorage(LocalAccStorage), + MRequirements(Requirements), MArgs(args), MArgStorage() { + + // Need to copy the arg values to node local storage so that they don't go + // out of scope before execution for (size_t i = 0; i < MArgs.size(); i++) { - if (MArgs[i].MType == sycl::detail::kernel_param_kind_t::kind_pointer) { - // Make sure we are storing the actual USM pointer for comparison - // purposes, note we couldn't actually submit using these copies of the - // args if subsequent code expects a void**. - MArgs[i].MPtr = *(void **)(MArgs[i].MPtr); - } + auto &CurrentArg = MArgs[i]; + MArgStorage.emplace_back(CurrentArg.MSize); + auto StoragePtr = MArgStorage.back().data(); + if (CurrentArg.MPtr) + std::memcpy(StoragePtr, CurrentArg.MPtr, CurrentArg.MSize); + // Set the arg descriptor to point to the new storage + CurrentArg.MPtr = StoragePtr; } } @@ -90,13 +122,14 @@ struct node_impl { Schedule.push_front(std::shared_ptr(this)); } - bool has_arg(const sycl::detail::ArgDesc &Arg, bool DereferencePtr = false) { + bool has_arg(const sycl::detail::ArgDesc &Arg) { for (auto &NodeArg : MArgs) { if (Arg.MType == NodeArg.MType && Arg.MSize == NodeArg.MSize) { - // Args coming directly from the handler will need to be dereferenced - // since they are actually void** - void *IncomingPtr = DereferencePtr ? 
*(void **)Arg.MPtr : Arg.MPtr; - if (IncomingPtr == NodeArg.MPtr) { + // Args are actually void** so we need to dereference them to compare + // actual values + void *IncomingPtr = *static_cast(Arg.MPtr); + void *ArgPtr = *static_cast(NodeArg.MPtr); + if (IncomingPtr == ArgPtr) { return true; } } @@ -119,9 +152,20 @@ struct graph_impl { void add_root(const std::shared_ptr &); void remove_root(const std::shared_ptr &); - template std::shared_ptr - add(const std::shared_ptr &Impl, T CGF, + add(const std::shared_ptr &Impl, + std::shared_ptr Kernel, + sycl::detail::NDRDescT NDRDesc, + sycl::detail::OSModuleHandle OSModuleHandle, std::string KernelName, + const std::vector &AccStorage, + const std::vector &LocalAccStorage, + const std::vector &Requirements, + const std::vector &Args, + const std::vector> &Dep = {}); + + std::shared_ptr + add(const std::shared_ptr &Impl, + std::function CGF, const std::vector &Args, const std::vector> &Dep = {}); diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index baa9276fe4069..5f883b224f206 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -29,6 +29,8 @@ class handler_impl { : MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)), MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){}; + handler_impl() = default; + void setStateExplicitKernelBundle() { if (MSubmissionState == HandlerSubmissionState::SPEC_CONST_SET_STATE) throw sycl::exception( diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 0c34a24275346..ba67dcfe482d9 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -550,34 +550,27 @@ class queue_impl { handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); Handler.saveCodeLoc(Loc); CGF(Handler); - if (auto graphImpl = Self->getCommandGraph(); graphImpl != nullptr) { - // Pass the args obtained by the handler to the graph to use in - // determining edges between this node and previously submitted nodes. - graphImpl->add(graphImpl, CGF, Handler.MArgs, {}); - } else { - // Scheduler will later omit events, that are not required to execute - // tasks. Host and interop tasks, however, are not submitted to low-level - // runtimes and require separate dependency management. - const CG::CGTYPE Type = Handler.getType(); + // Scheduler will later omit events, that are not required to execute + // tasks. Host and interop tasks, however, are not submitted to low-level + // runtimes and require separate dependency management. 
+ const CG::CGTYPE Type = Handler.getType(); - if (PostProcess) { - bool IsKernel = Type == CG::Kernel; - bool KernelUsesAssert = false; + if (PostProcess) { + bool IsKernel = Type == CG::Kernel; + bool KernelUsesAssert = false; - if (IsKernel) - // Kernel only uses assert if it's non interop one - KernelUsesAssert = - !(Handler.MKernel && Handler.MKernel->isInterop()) && - ProgramManager::getInstance().kernelUsesAssert( - Handler.MOSModuleHandle, Handler.MKernelName); + if (IsKernel) + // Kernel only uses assert if it's non interop one + KernelUsesAssert = !(Handler.MKernel && Handler.MKernel->isInterop()) && + ProgramManager::getInstance().kernelUsesAssert( + Handler.MOSModuleHandle, Handler.MKernelName); - finalizeHandler(Handler, Type, Event); + finalizeHandler(Handler, Type, Event); - (*PostProcess)(IsKernel, KernelUsesAssert, Event); - } else - finalizeHandler(Handler, Type, Event); - } + (*PostProcess)(IsKernel, KernelUsesAssert, Event); + } else + finalizeHandler(Handler, Type, Event); addEvent(Event); return Event; @@ -664,6 +657,8 @@ class queue_impl { // commands to this queue. Used by subgraphs to determine if they are part of // a larger command graph submission. bool MIsGraphSubmitting = false; + + friend class sycl::ext::oneapi::experimental::detail::node_impl; }; } // namespace detail diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp index 5134f9b51996c..47ba843243cda 100644 --- a/sycl/source/detail/reduction.cpp +++ b/sycl/source/detail/reduction.cpp @@ -52,6 +52,16 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize, // with the given queue. __SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups( std::shared_ptr Queue) { + // TODO: Graphs extension explicit API uses a handler with no queue attached, + // so return some value here. In the future we should have access to the + // device so can remove this. + // + // The 8 value was chosen as the hardcoded value as it is the returned + // value for sycl::info::device::max_compute_units on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 8; + } device Dev = Queue->get_device(); uint32_t NumThreads = Dev.get_info(); // TODO: The heuristics here require additional tuning for various devices @@ -104,6 +114,16 @@ reduGetMaxWGSize(std::shared_ptr Queue, __SYCL_EXPORT size_t reduGetPreferredWGSize(std::shared_ptr &Queue, size_t LocalMemBytesPerWorkItem) { + // TODO: Graphs extension explicit API uses a handler with a null queue to + // process CGFs, in future we should have access to the device so we can + // correctly calculate this. + // + // The 32 value was chosen as the hardcoded value as it is the returned + // value for SYCL_REDUCTION_PREFERRED_WORKGROUP_SIZE on + // Intel HD Graphics devices used as a L0 backend during development. + if (Queue == nullptr) { + return 32; + } device Dev = Queue->get_device(); // The maximum WGSize returned by CPU devices is very large and does not diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fbd42f6c2563d..e0546e5066453 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -40,6 +40,10 @@ handler::handler(std::shared_ptr Queue, std::move(SecondaryQueue))), MQueue(std::move(Queue)), MIsHost(IsHost) {} +handler::handler( + std::shared_ptr Graph) + : MImpl(std::make_shared()), MGraph(Graph) {} + // Sets the submission state to indicate that an explicit kernel bundle has been // set. 
Throws a sycl::exception with errc::invalid if the current state // indicates that a specialization constant has been set. @@ -93,6 +97,14 @@ event handler::finalize() { if (MIsFinalized) return MLastEvent; MIsFinalized = true; + if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl != nullptr) { + // Extract relevant data from the handler and pass to graph to create a new + // node representing this command group. + GraphImpl->add(GraphImpl, MKernel, MNDRDesc, MOSModuleHandle, MKernelName, + MAccStorage, MLocalAccStorage, MRequirements, MArgs, {}); + return detail::createSyclObjFromImpl( + std::make_shared()); + } std::shared_ptr KernelBundleImpPtr = nullptr; // If there were uses of set_specialization_constant build the kernel_bundle diff --git a/sycl/test/graph/graph-explicit-dotp-buffer.cpp b/sycl/test/graph/graph-explicit-dotp-buffer.cpp new file mode 100644 index 0000000000000..0b795714f98dd --- /dev/null +++ b/sycl/test/graph/graph-explicit-dotp-buffer.cpp @@ -0,0 +1,108 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include + +#include + +const size_t n = 10; + +float host_gold_result() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + float sum = 0.0f; + + for (size_t i = 0; i < n; ++i) { + sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f); + } + + return sum; +} + +int main() { + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float dotpData = 0.f; + std::vector xData(n); + std::vector yData(n); + std::vector zData(n); + + { + sycl::buffer dotpBuf(&dotpData, sycl::range<1>(1)); + + sycl::buffer xBuf(xData); + sycl::buffer yBuf(yData); + sycl::buffer zBuf(zData); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + auto x = xBuf.get_access(h); + auto y = yBuf.get_access(h); + auto z = zBuf.get_access(h); + h.parallel_for(n, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + }); + }); + + auto node_a = g.add([&](sycl::handler &h) { + auto x = xBuf.get_access(h); + auto y = yBuf.get_access(h); + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }); + + auto node_b = g.add([&](sycl::handler &h) { + auto y = yBuf.get_access(h); + auto z = zBuf.get_access(h); + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }); + + auto node_c = g.add([&](sycl::handler &h) { + auto dotp = dotpBuf.get_access(h); + auto x = xBuf.get_access(h); + auto z = zBuf.get_access(h); +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotpBuf, h, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause issues + // with graphs. 
+ for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif + }); + + auto executable_graph = g.finalize(q.get_context()); + + // Using shortcut for executing a graph of commands + q.ext_oneapi_graph(executable_graph).wait(); + } + + assert(dotpData == host_gold_result()); + return 0; +} diff --git a/sycl/test/graph/graph-explicit-subgraph.cpp b/sycl/test/graph/graph-explicit-subgraph.cpp index 160418a9ae012..6f20c34541a0e 100644 --- a/sycl/test/graph/graph-explicit-subgraph.cpp +++ b/sycl/test/graph/graph-explicit-subgraph.cpp @@ -74,12 +74,22 @@ int main() { auto node_c = g.add( [&](sycl::handler &h) { +#ifdef TEST_GRAPH_REDUCTIONS h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { const size_t i = it[0]; sum += x[i] * z[i]; }); +#else + h.single_task([=]() { + // Doing a manual reduction here because reduction objects cause + // issues with graphs. + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; + } + }); +#endif }, {node_sub}); diff --git a/sycl/test/graph/graph-record-dotp-buffer.cpp b/sycl/test/graph/graph-record-dotp-buffer.cpp index 0e49e86cf1cad..75764428e617f 100644 --- a/sycl/test/graph/graph-record-dotp-buffer.cpp +++ b/sycl/test/graph/graph-record-dotp-buffer.cpp @@ -83,18 +83,23 @@ int main() { auto dotp = dotpBuf.get_access(h); auto x = xBuf.get_access(h); auto z = zBuf.get_access(h); - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotpBuf, h, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { // Doing a manual reduction here because reduction objects cause issues // with graphs. - if (i == 0) { - for (size_t j = 0; j < n; j++) { - dotp[0] += x[j] * z[j]; - } + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; } }); +#endif }); - g.end_recording(); auto exec_graph = g.finalize(q.get_context()); @@ -109,4 +114,4 @@ int main() { } return 0; -} \ No newline at end of file +} diff --git a/sycl/test/graph/graph-record-dotp.cpp b/sycl/test/graph/graph-record-dotp.cpp index 538383cebc0de..a7627a449c603 100644 --- a/sycl/test/graph/graph-record-dotp.cpp +++ b/sycl/test/graph/graph-record-dotp.cpp @@ -67,16 +67,21 @@ int main() { }); q.submit([&](sycl::handler &h) { - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; +#ifdef TEST_GRAPH_REDUCTIONS + h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); +#else + h.single_task([=]() { // Doing a manual reduction here because reduction objects cause issues // with graphs. 
- if (i == 0) { - for (size_t j = 0; j < n; j++) { - dotp[0] += x[j] * z[j]; - } + for (size_t j = 0; j < n; j++) { + dotp[0] += x[j] * z[j]; } }); +#endif }); g.end_recording(); @@ -99,4 +104,4 @@ int main() { std::cout << "done.\n"; return 0; -} \ No newline at end of file +} diff --git a/sycl/test/graph/graph-record-temp-scope.cpp b/sycl/test/graph/graph-record-temp-scope.cpp new file mode 100644 index 0000000000000..b4d660ccaec0f --- /dev/null +++ b/sycl/test/graph/graph-record-temp-scope.cpp @@ -0,0 +1,49 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +#include + +#include + +const size_t n = 10; +const float expectedValue = 42.0f; + +void run_some_kernel(sycl::queue q, float *data) { + // data is captured by ref here but will have gone out of scope when the + // CGF is later run when the graph is executed. + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + data[i] = expectedValue; + }); + }); +} + +int main() { + + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::default_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float *arr = sycl::malloc_shared(n, q); + + g.begin_recording(q); + run_some_kernel(q, arr); + g.end_recording(q); + + auto exec_graph = g.finalize(q.get_context()); + + q.submit([&](sycl::handler &h) { h.ext_oneapi_graph(exec_graph); }); + + // Verify results + for (size_t i = 0; i < n; i++) { + assert(arr[i] == expectedValue); + } + + sycl::free(arr, q.get_context()); + + return 0; +}
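
Note (not part of the patch): the sketch below summarises the explicit-API flow that the new tests above exercise — add nodes to a command_graph, pass an explicit dependency list, finalize against a context, and submit via the ext_oneapi_graph queue shortcut. The API names (command_graph, add, finalize, ext_oneapi_graph, lazy_execution) are taken from those tests; the USM kernels and the node names are illustrative only, and the explicit dependency edge is needed here because, with this patch, only accessor arguments create implicit edges.

// Minimal usage sketch, assuming the extension interface shown in the tests.
#include <sycl/sycl.hpp>

int main() {
  sycl::property_list properties{
      sycl::property::queue::in_order{},
      sycl::ext::oneapi::property::queue::lazy_execution{}};
  sycl::queue q{sycl::default_selector_v, properties};

  sycl::ext::oneapi::experimental::command_graph g;

  constexpr size_t n = 10;
  float *data = sycl::malloc_shared<float>(n, q);

  // Node 1: initialise the USM allocation.
  auto init = g.add([&](sycl::handler &h) {
    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> i) { data[i] = 1.0f; });
  });

  // Node 2: depends on node 1 via the explicit dependency list; USM pointer
  // arguments do not create graph edges automatically.
  g.add(
      [&](sycl::handler &h) {
        h.parallel_for(sycl::range<1>{n},
                       [=](sycl::id<1> i) { data[i] += 2.0f; });
      },
      {init});

  auto exec_graph = g.finalize(q.get_context());
  q.ext_oneapi_graph(exec_graph).wait();

  sycl::free(data, q.get_context());
  return 0;
}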