From 7c62056b70631eedda99b24c10595d2948ebdf40 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 18 Feb 2022 15:16:27 -0600 Subject: [PATCH 01/43] Inital version of sycl graph prototype --- .../sycl/ext/oneapi/experimental/graph.hpp | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/graph.hpp diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp new file mode 100644 index 0000000000000..08a7d094e9054 --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -0,0 +1,212 @@ +//==--------- graph.hpp --- SYCL graph extension ---------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include +#include + +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace ext { +namespace oneapi { +namespace experimental { +namespace detail { + +struct node_impl; + +struct graph_impl; + +using node_ptr = std::shared_ptr; + +using graph_ptr = std::shared_ptr; + +class wrapper { + using T = std::function; + T my_func; + std::vector my_deps; +public: + wrapper(T t, const std::vector& deps) : my_func(t), my_deps(deps) {}; + + void operator()(sycl::handler& cgh) { + cgh.depends_on(my_deps); + std::invoke(my_func,cgh); + } +}; + +struct node_impl { + bool is_scheduled; + + graph_ptr my_graph; + sycl::event my_event; + + std::vector my_successors; + std::vector my_predecessors; + + std::function my_body; + + void exec( sycl::queue q ) { + std::vector __deps; + for(auto i:my_predecessors) __deps.push_back(i->get_event()); + my_event = q.submit(wrapper{my_body,__deps}); + } + + void register_successor(node_ptr n) { + my_successors.push_back(n); 
+ n->register_predecessor(node_ptr(this)); + } + + void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + + sycl::event get_event(void) {return my_event;} + + template + node_impl(graph_ptr g, T cgf) : is_scheduled(false), my_graph(g), my_body(cgf) {} + + // Recursively adding nodes to execution stack: + void topology_sort(std::list& schedule) { + is_scheduled = true; + for(auto i:my_successors) { + if(!i->is_scheduled) i->topology_sort(schedule); + } + schedule.push_front(node_ptr(this)); + } +}; + +struct graph_impl { + std::set my_roots; + std::list my_schedule; + + graph_ptr parent; + + void exec( sycl::queue q ) { + if( my_schedule.empty() ) { + for(auto n : my_roots) { + n->topology_sort(my_schedule); + } + } + for(auto n : my_schedule) n->exec(q); + } + + void exec_and_wait( sycl::queue q ) { + exec(q); + q.wait(); + } + + void add_root(node_ptr n) { + my_roots.insert(n); + for(auto n : my_schedule) n->is_scheduled=false; + my_schedule.clear(); + } + + void remove_root(node_ptr n) { + my_roots.erase(n); + for(auto n : my_schedule) n->is_scheduled=false; + my_schedule.clear(); + } + + graph_impl() {} +}; + +} // namespace detail + +class node; + +class graph; + +class executable_graph; + +struct node { + // TODO: add properties to distinguish between empty, host, device nodes. + detail::node_ptr my_node; + detail::graph_ptr my_graph; + + template + node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {}; + void register_successor(node n) { my_node->register_successor(n.my_node); } + void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); } + + void set_root() { my_graph->add_root(my_node);} + + // TODO: Add query functions: is_root, ... 
+}; + +class executable_graph { +public: + int my_tag; + sycl::queue my_queue; + + void exec_and_wait();// { my_queue.wait(); } + + executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) { + g->exec(my_queue); + } +}; + +class graph { +public: + // Adding empty node with [0..n] predecessors: + node add_empty_node(const std::vector& dep = {}); + + // Adding node for host task + template + node add_host_node(T hostTaskCallable, const std::vector& dep = {}); + + // Adding device node: + template + node add_device_node(T cgf, const std::vector& dep = {}); + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); + + // TODO: Extend queue to directly submit graph + void exec_and_wait( sycl::queue q ); + + executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};}; + + graph() : my_graph(new detail::graph_impl()) {} + + // Creating a subgraph (with predecessors) + graph(graph& parent, const std::vector& dep = {}) {} + + bool is_subgraph(); + +private: + detail::graph_ptr my_graph; +}; + +void executable_graph::exec_and_wait() { my_queue.wait(); } + +template +node graph::add_device_node(T cgf , const std::vector& dep) { + node _node(my_graph,cgf); + if( !dep.empty() ) { + for(auto n : dep) this->make_edge(n,_node); + } else { + _node.set_root(); + } + return _node; +} + +void graph::make_edge(node sender, node receiver) { + sender.register_successor(receiver);//register successor + my_graph->remove_root(receiver.my_node); //remove receiver from root node list +} + +void graph::exec_and_wait( sycl::queue q ) { + my_graph->exec_and_wait(q); +}; + +} // namespace experimental +} // namespace oneapi +} // namespace ext +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) + From 59bb7da776178997cfe8953e39e1432639a02f8c Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 18 Feb 2022 15:15:10 -0600 Subject: [PATCH 02/43] Adding initial sycl graph doc --- .../SYCL_EXT_ONEAPI_GRAPH.asciidoc | 290 
++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100644 sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc new file mode 100644 index 0000000000000..3bb7051730b7d --- /dev/null +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -0,0 +1,290 @@ += SYCL_EXT_ONEAPI_GRAPH +:source-highlighter: coderay +:coderay-linenums-mode: table + +// This section needs to be after the document title. +:doctype: book +:toc2: +:toc: left +:encoding: utf-8 +:lang: en + +:blank: pass:[ +] + +// Set the default source code type in this document to C++, +// for syntax highlighting purposes. This is needed because +// docbook uses c++ and html5 uses cpp. +:language: {basebackend@docbook:c++:cpp} + +== Notice + +Copyright (c) 2022 Intel Corporation. All rights reserved. + +IMPORTANT: This specification is a draft. + +NOTE: Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are +trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. +used by permission by Khronos. + +NOTE: This document is better viewed when rendered as html with asciidoctor. +GitHub does not render image icons. + +This extension is written against the SYCL 2020 revision 4 specification. All +references below to the "core SYCL specification" or to section numbers in the +SYCL specification refer to that revision. + +NOTE: This extension is experimental: interfaces are subject to change later. + +== Introduction + +This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating +Its definition and execution. + +== Feature test macro + +This extension provides a feature-test macro as described in the core SYCL +specification section 6.3.3 "Feature test macros". 
Therefore, an +implementation supporting this extension must predefine the macro +`SYCL_EXT_ONEAPI_GRAPH` to one of the values defined in the table below. +Applications can test for the existence of this macro to determine if the +implementation supports this feature, or applications can test the macro's +value to determine which of the extension's APIs the implementation supports. + +Table 1. Values of the `SYCL_EXT_ONEAPI_GRAPH` macro. +[%header,cols="1,5"] +|=== +|Value |Description +|1 |Initial extension version. Base features are supported. +|=== + +== SYCL Graph Terminology + +Table 2. Terminology. +|=== +|Concept|Description +|graph| Class that stores structured work units and their dependencies +|node| The unit of work. Can have different attributes. +|edge| Dependency between work units. Happens before relation. +|=== + +== Node + +Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution. +A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class node{ + }; +} +---- + +NOTE: + +== Edge + +A dependency between two nodes representing a happens before relationship. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + // Adding dependency between two nodes. + void make_edge(node sender, node receiver); +} +---- + +== Graph + +Graph is a class that represents a directed acyclic graph of nodes. +A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. +Member functions as listed in Table 2 and 3 can be used to add nodes to a graph. 
+ +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class graph { + }; + +} +---- + +=== Executable Graph + +`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. +The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed. +Each `executable_graph` object can only be executed once at the same time on its assigned queue. + +[source,c++] +---- +namespace sycl::ext::oneapi::experimental { + + class executable_graph { + }; + +} +---- + + +Table 3. Constructors of the `graph` class. +|=== +|Constructor|Description + +|`graph()` +|Creates a `graph` object + +|`graph(graph& parent)` +|Creates a nested `graph` object + +|=== + +Table 4. Member functions of the `graph` class. +|=== +|Member function|Description + +|`node add_empty_node(const std::vector& dep = {});` +|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) . + +|`template + node add_host_node(T hostTaskCallable, const std::vector& dep = {});` +|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime. + +|`template + node add_device_node(T cgf, const std::vector& dep = {});` +|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec. + +|`template + executable_graph make_executable(const queue& syclQueue);` +|Returns a queue specific graph object that can be submitted to a queue. + +|`template + executable_graph make_executable(const device& syclDevice, const context& syclContext);` +|Returns a device and context specific graph object that can be submitted to a queue. + +|=== + +Table 5. Member functions of the `graph` class (memory operations). 
+|=== +|Member function|Description + +|`node add_memcpy_node(void* dest, const void* src, size_t numBytes, const std::vector& dep = {});` +|Adding a node that encapsulates a `memcpy` operation. + +|`node add_memset_node(void* ptr, int value, size_t numBytes, const std::vector& dep = {});` +|Adding a node that encapsulates a `memset` operation. + +|`node add_malloc_node(void *data, size_t numBytes, usm::alloc kind, const std::vector& dep = {});` +|Adding a node that encapsulates a `malloc` operation. + +|`node add_free_node(void *data, const std::vector& dep = {});` +|Adding a node that encapsulates a `free` operation. + +|=== + + +== Examples + +1. Dot product + +[source,c++] +---- +... + +#include + +int main() { + const size_t n = 10; + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + +#ifndef POC_IMPL + sycl::queue q; +#else + sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}}; + sycl::queue q{p}; +#endif + + sycl::ext::oneapi::experimental::graph g; + + float *x = sycl::malloc_shared(n, q); + float *y = sycl::malloc_shared(n, q); + float *z = sycl::malloc_shared(n, q); + + float *dotp = sycl::malloc_shared(1, q); + + for (int i = 0; i < n; i++) { + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; + } + + auto node_a = g.add_device_node([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; + }); + }); + + auto node_b = g.add_device_node([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; + }); + }); + + auto node_c = g.add_device_node( + [&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(dotp, 0.0f, std::plus()), + [=](sycl::id<1> it, auto &sum) { + const size_t i = it[0]; + sum += x[i] * z[i]; + }); + }, + {node_a, node_b}); + + auto exec = g.make_exec(q); + +#ifndef POC_IMPL + q.submit(exec).wait(); +#else + 
exec.exec_and_wait(); +#endif + + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); + sycl::free(dotp, q); + + return 0; +} + + +... +---- + +== Issues for later investigations + +. Explicit memory movement can cause POC to stall. + +== Non-implemented features +Please, note that the following features are not yet implemented: + +. Level Zero backend only +. Memory operation nodes not implemented +. Host node not implemented +. Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. + +== Revision History + +[cols="5,15,15,70"] +[grid="rows"] +[options="header"] +|======================================== +|Rev|Date|Author|Changes +|1|2022-02-11|Pablo Reble|Initial public working draft +|======================================== From 528017a77952d97bee1b4d5b3e723d3f6efb5412 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 7 Jan 2022 14:37:00 -0600 Subject: [PATCH 03/43] Reusing command list for re-execution (WIP) --- sycl/include/CL/sycl/detail/pi.def | 1 + sycl/include/CL/sycl/detail/pi.h | 2 + sycl/plugins/level_zero/pi_level_zero.cpp | 45 ++++++++++++++++++++++- sycl/plugins/level_zero/pi_level_zero.hpp | 2 + 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/detail/pi.def b/sycl/include/CL/sycl/detail/pi.def index c9a68c6cadec3..b26c7e5588867 100644 --- a/sycl/include/CL/sycl/detail/pi.def +++ b/sycl/include/CL/sycl/detail/pi.def @@ -103,6 +103,7 @@ _PI_API(piSamplerGetInfo) _PI_API(piSamplerRetain) _PI_API(piSamplerRelease) // Queue commands +_PI_API(piKernelLaunch) _PI_API(piEnqueueKernelLaunch) _PI_API(piEnqueueNativeKernel) _PI_API(piEnqueueEventsWait) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 4d3e841f731e1..4cfd760d7a8ab 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -1453,6 +1453,8 @@ __SYCL_EXPORT 
pi_result piSamplerRelease(pi_sampler sampler); // // Queue Commands // +__SYCL_EXPORT pi_result piKernelLaunch(pi_queue queue); + __SYCL_EXPORT pi_result piEnqueueKernelLaunch( pi_queue queue, pi_kernel kernel, pi_uint32 work_dim, const size_t *global_work_offset, const size_t *global_work_size, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 9e9b304bc84ed..61351246bf05c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1454,6 +1454,11 @@ pi_result _pi_queue::getOrCreateCopyCommandQueue( return PI_SUCCESS; } +bool _pi_queue::isEagerExec() { + return false; + // (this->PiQueueProperties & (1<<5)) == 0) +} + // This function will return one of possibly multiple available copy queues. // Currently, a round robin strategy is used. // This function also sends back the value of CopyQueueIndex and @@ -4773,7 +4778,7 @@ pi_result piKernelRelease(pi_kernel Kernel) { } pi_result -piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, +piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, @@ -4906,14 +4911,52 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, if (IndirectAccessTrackingEnabled) Queue->KernelsToBeSubmitted.push_back(Kernel); +#if 0 // Execute command list asynchronously, as the event will be used // to track down its completion. if (auto Res = Queue->executeCommandList(CommandList, false, true)) return Res; +#endif return PI_SUCCESS; } +pi_result +piKernelLaunch(pi_queue Queue) { + + //TODO: Make sure (re-)execute specific command list. 
+ + // Get a new command list to be used on this call + pi_command_list_ptr_t CommandList{}; + if (auto Res = Queue->Context->getAvailableCommandList( + Queue, CommandList, false /* PreferCopyEngine */, + true /* AllowBatching */)) + return Res; + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + if (auto Res = Queue->executeCommandList(CommandList, false, true)) + return Res; + + return PI_SUCCESS; +} + +pi_result +piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, + const size_t *GlobalWorkOffset, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { + auto Res = + piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event); +#if 0 + if(Res == PI_SUCCESS && Queue->isEagerExec()) { + return piLazyKernelLaunch(Queue); + } +#endif + return Res; +} + pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, pi_context Context, pi_program Program, diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 3bff3389cae1e..d73cf4f427fed 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -743,6 +743,8 @@ struct _pi_queue : _pi_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; + bool isEagerExec(); + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. // For copy commands, IsCopy is set to 'true'. 
From aee48a541b7582725535d5ef852b385ee805ada7 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 15 Feb 2022 17:18:32 -0600 Subject: [PATCH 04/43] Adding lazy execution property to queue --- .../CL/sycl/detail/property_helper.hpp | 3 +- sycl/include/CL/sycl/feature_test.hpp.in | 1 + .../CL/sycl/properties/queue_properties.hpp | 8 +++++ sycl/plugins/level_zero/pi_level_zero.cpp | 33 ++++++++++++++----- sycl/plugins/level_zero/pi_level_zero.hpp | 11 +++++-- sycl/source/detail/queue_impl.cpp | 8 +++++ 6 files changed, 52 insertions(+), 12 deletions(-) diff --git a/sycl/include/CL/sycl/detail/property_helper.hpp b/sycl/include/CL/sycl/detail/property_helper.hpp index 12bc497ee2a70..9dc34de890c69 100644 --- a/sycl/include/CL/sycl/detail/property_helper.hpp +++ b/sycl/include/CL/sycl/detail/property_helper.hpp @@ -34,8 +34,9 @@ enum DataLessPropKind { InitializeToIdentity = 7, UseDefaultStream = 8, DiscardEvents = 9, + LazyExecution = 10, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 9, + LastKnownDataLessPropKind = 10, // Exceeding 32 may cause ABI breaking change on some of OSes. 
DataLessPropKindSize = 32 }; diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index 30164956bb0f0..c3444c94e98de 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -47,6 +47,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_FREE_FUNCTION_QUERIES 1 #define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1 #define SYCL_EXT_ONEAPI_GROUP_SORT 1 +#define SYCL_EXT_ONEAPI_LAZY_QUEUE 1 #define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1 diff --git a/sycl/include/CL/sycl/properties/queue_properties.hpp b/sycl/include/CL/sycl/properties/queue_properties.hpp index 76a3bfaea9373..c9fb6e88d890c 100644 --- a/sycl/include/CL/sycl/properties/queue_properties.hpp +++ b/sycl/include/CL/sycl/properties/queue_properties.hpp @@ -28,6 +28,8 @@ namespace property { namespace queue { class discard_events : public ::cl::sycl::detail::DataLessProperty< ::cl::sycl::detail::DiscardEvents> {}; +class lazy_execution : public ::cl::sycl::detail::DataLessProperty< + ::cl::sycl::detail::LazyExecution> {}; } // namespace queue } // namespace property @@ -63,6 +65,9 @@ template <> struct is_property : std::true_type {}; template <> +struct is_property + : std::true_type {}; +template <> struct is_property : std::true_type { }; template <> @@ -78,6 +83,9 @@ template <> struct is_property_of : std::true_type {}; template <> +struct is_property_of + : std::true_type {}; +template <> struct is_property_of : std::true_type {}; template <> diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 61351246bf05c..a962f54f9980c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -838,6 +838,11 @@ bool _pi_queue::isInOrderQueue() const { return ((this->Properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) == 0); } +bool _pi_queue::isEagerExec() 
const { + // If lazy exec queue property is not set, then it's an eager queue. + return ((this->PiQueueProperties & (1<<10) ) == 0); +} + pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, bool MakeAvailable) { bool UseCopyEngine = CommandList->second.isCopy(); @@ -1041,7 +1046,9 @@ _pi_queue::_pi_queue(ze_command_queue_handle_t Queue, pi_result _pi_context::getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching) { + bool UseCopyEngine, bool AllowBatching, bool Graph) { +_pi_result pi_result = PI_OUT_OF_RESOURCES; +if(!Graph && (Queue->CommandListMap.size() == 0)) { auto &CommandBatch = UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; // Handle batching of commands @@ -1064,7 +1071,7 @@ _pi_context::getAvailableCommandList(pi_queue Queue, // the command lists, and later are then added to the command queue. // Each command list is paired with an associated fence to track when the // command list is available for reuse. - _pi_result pi_result = PI_OUT_OF_RESOURCES; + //_pi_result pi_result = PI_OUT_OF_RESOURCES; ZeStruct ZeFenceDesc; auto &ZeCommandListCache = @@ -1172,6 +1179,11 @@ _pi_context::getAvailableCommandList(pi_queue Queue, pi_result = PI_SUCCESS; } +} else { + CommandList = Queue->CommandListMap.begin(); + pi_result = PI_SUCCESS; +} + return pi_result; } @@ -1230,7 +1242,7 @@ void _pi_queue::adjustBatchSizeForPartialBatch(bool IsCopy) { pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking, - bool OKToBatchCommand) { + bool OKToBatchCommand, bool Graph) { int Index = CommandList->second.CopyQueueIndex; bool UseCopyEngine = (Index != -1); if (UseCopyEngine) @@ -1374,6 +1386,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList, } // Close the command list and have it ready for dispatch. + // TODO: Close command list only once before initial execution, but works as is. 
+ //if(!Graph) ZE_CALL(zeCommandListClose, (CommandList->first)); // Offload command list to the GPU for asynchronous execution auto ZeCommandList = CommandList->first; @@ -4923,19 +4937,22 @@ piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, pi_result piKernelLaunch(pi_queue Queue) { - + + const bool Graph = !(Queue->isEagerExec()); + //const bool Graph = true; + //TODO: Make sure (re-)execute specific command list. // Get a new command list to be used on this call pi_command_list_ptr_t CommandList{}; if (auto Res = Queue->Context->getAvailableCommandList( Queue, CommandList, false /* PreferCopyEngine */, - true /* AllowBatching */)) + true /* AllowBatching */, Graph /* Shortcut for Graph */)) return Res; // Execute command list asynchronously, as the event will be used // to track down its completion. - if (auto Res = Queue->executeCommandList(CommandList, false, true)) + if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph)) return Res; return PI_SUCCESS; @@ -4949,9 +4966,9 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const pi_event *EventWaitList, pi_event *Event) { auto Res = piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event); -#if 0 +#if 1 if(Res == PI_SUCCESS && Queue->isEagerExec()) { - return piLazyKernelLaunch(Queue); + return piKernelLaunch(Queue); } #endif return Res; diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index d73cf4f427fed..9a294cb0ad649 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -541,6 +541,10 @@ struct _pi_context : _pi_object { std::unordered_map> ZeCopyCommandListCache; + // Single command list for graph api + + std::list ZeGraphCommandList; + // Retrieves a command list for executing on this device along with // a fence to be used in tracking the execution of this command list. 
// If a command list has been created on this device which has @@ -558,7 +562,8 @@ struct _pi_context : _pi_object { pi_result getAvailableCommandList(pi_queue Queue, pi_command_list_ptr_t &CommandList, bool UseCopyEngine = false, - bool AllowBatching = false); + bool AllowBatching = false, + bool Graph = false); // Get index of the free slot in the available pool. If there is no available // pool then create new one. The HostVisible parameter tells if we need a @@ -743,7 +748,7 @@ struct _pi_queue : _pi_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; - bool isEagerExec(); + bool isEagerExec() const; // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. @@ -782,7 +787,7 @@ struct _pi_queue : _pi_object { // of the value of OKToBatchCommand pi_result executeCommandList(pi_command_list_ptr_t CommandList, bool IsBlocking = false, - bool OKToBatchCommand = false); + bool OKToBatchCommand = false, bool Graph = false); // If there is an open command list associated with this queue, // close it, execute it, and reset the corresponding OpenCommandList. 
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index f786a87ef98c0..1fada9904c6cb 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -274,6 +274,14 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); #endif +#if 1 + if(has_property()){ + const detail::plugin &Plugin = getPlugin(); + if (Plugin.getBackend() == backend::ext_oneapi_level_zero) + Plugin.call(getHandleRef()); + } +#endif + std::vector> WeakEvents; std::vector SharedEvents; { From 24fa5a9a24188dd31281b8e2bfb3f0be68b5cea4 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 21 Feb 2022 22:25:38 -0600 Subject: [PATCH 05/43] fix merge --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index a962f54f9980c..5d6ebb4046c57 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -840,7 +840,7 @@ bool _pi_queue::isInOrderQueue() const { bool _pi_queue::isEagerExec() const { // If lazy exec queue property is not set, then it's an eager queue. 
- return ((this->PiQueueProperties & (1<<10) ) == 0); + return ((this->Properties & (1<<10) ) == 0); } pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, From b7ce27191f6d721997236584946456d6c9e8a347 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Tue, 22 Feb 2022 10:46:54 -0600 Subject: [PATCH 06/43] Update pi_level_zero.cpp Fix merge conflict --- sycl/plugins/level_zero/pi_level_zero.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 5d6ebb4046c57..fbf8f0ae69601 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1468,11 +1468,6 @@ pi_result _pi_queue::getOrCreateCopyCommandQueue( return PI_SUCCESS; } -bool _pi_queue::isEagerExec() { - return false; - // (this->PiQueueProperties & (1<<5)) == 0) -} - // This function will return one of possibly multiple available copy queues. // Currently, a round robin strategy is used. // This function also sends back the value of CopyQueueIndex and From f3d30edf7c555ef7b19b2505768c75cec9a6d23e Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 11 Mar 2022 16:42:43 +0100 Subject: [PATCH 07/43] update extension proposal started to incorporate feedback --- .../SYCL_EXT_ONEAPI_GRAPH.asciidoc | 92 ++++++++----------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index 3bb7051730b7d..efe81d24b767d 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -69,15 +69,15 @@ Table 2. Terminology. == Node -Node is a class that can encapsulate SYCL kernel functions or host tasks for deferred execution. +Node is a class that encapsulates tasks like SYCL kernel functions or host tasks for deferred execution. 
A graph has to be created first, the structure of a graph is defined second by adding nodes and edges. [source,c++] ---- namespace sycl::ext::oneapi::experimental { - class node{ - }; + class node{ + }; } ---- @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. +A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs. [source,c++] ---- @@ -99,45 +99,46 @@ namespace sycl::ext::oneapi::experimental { == Graph Graph is a class that represents a directed acyclic graph of nodes. -A graph can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. +A graph can have different states, can be nested, can have multiple root nodes that are scheduled for execution first and multiple leaf nodes that are scheduled for execution last. The execution of a graph has been completed when all leaf node tasks have been completed. Member functions as listed in Table 2 and 3 can be used to add nodes to a graph. [source,c++] ---- namespace sycl::ext::oneapi::experimental { - class graph { + enum class graph_state{ + modifiable, + executable }; + template + class graph { + public: + operator graph(); + }; + + graph make_graph(); + + graph compile(const graph Graph); + } ----- -=== Executable Graph +sycl::event sycl::queue(const graph Graph); -`executable_graph` represents a user generated device and context specific execution object that can be submitted to a queue for execution. -The structure of an `executable_graph` object, such as adding nodes or edges, can not be changed. -Each `executable_graph` object can only be executed once at the same time on its assigned queue. 
- -[source,c++] ---- -namespace sycl::ext::oneapi::experimental { - - class executable_graph { - }; -} ----- +=== Executable Graph +A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. +The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node. +Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. -Table 3. Constructors of the `graph` class. +Table 3. Constructor of the `graph` class. |=== |Constructor|Description |`graph()` -|Creates a `graph` object - -|`graph(graph& parent)` -|Creates a nested `graph` object +|Creates a `graph` object. It's default state is `graph_state::modifiable`. |=== @@ -145,24 +146,12 @@ Table 4. Member functions of the `graph` class. |=== |Member function|Description -|`node add_empty_node(const std::vector& dep = {});` -|This node holds no task that is scheduled for execution. It's intended use is a synchronization point inside a graph, this node can significantly reduce the number of edges ( O(n) vs. O(n^2) ) . - -|`template - node add_host_node(T hostTaskCallable, const std::vector& dep = {});` -|This node captures a host task, a native C++ callable which is scheduled by the SYCL runtime. +|`node add_node(const std::vector& dep = {});` +|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. |`template - node add_device_node(T cgf, const std::vector& dep = {});` -|This node captures a SYCL function for invoking kernels, with all restrictions that apply as described in the spec. 
- -|`template - executable_graph make_executable(const queue& syclQueue);` -|Returns a queue specific graph object that can be submitted to a queue. - -|`template - executable_graph make_executable(const device& syclDevice, const context& syclContext);` -|Returns a device and context specific graph object that can be submitted to a queue. + node add_node(T cgf, const std::vector& dep = {});` +|This node captures a command group function object containing host task which is scheduled by the SYCL runtime or a SYCL function for invoking kernels with all restrictions that apply as described in the spec. |=== @@ -187,6 +176,8 @@ Table 5. Member functions of the `graph` class (memory operations). == Examples +NOTE: The examples below demonstrate intended usage of the extension, but are not compatible with the proof-of-concept implementation. The proof-of-concept implementation currently requires different syntax, as described in the "Non-implemented features" section at the end of this document. + 1. 
Dot product

[source,c++] @@ -201,14 +192,9 @@ int main() { float beta = 2.0f; float gamma = 3.0f; -#ifndef POC_IMPL sycl::queue q; -#else - sycl::property_list p{sycl::ext::oneapi::property::queue::lazy_execution{}}; - sycl::queue q{p}; -#endif - sycl::ext::oneapi::experimental::graph g; + auto g = sycl::ext::oneapi::experimental::make_graph(); float *x = sycl::malloc_shared(n, q); float *y = sycl::malloc_shared(n, q); @@ -222,21 +208,21 @@ int main() { z[i] = 3.0f; } - auto node_a = g.add_device_node([&](sycl::handler &h) { + auto node_a = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; x[i] = alpha * x[i] + beta * y[i]; }); }); - auto node_b = g.add_device_node([&](sycl::handler &h) { + auto node_b = g.add_node([&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { const size_t i = it[0]; z[i] = gamma * z[i] + beta * y[i]; }); }); - auto node_c = g.add_device_node( + auto node_c = g.add_node( [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), @@ -247,13 +233,9 @@ int main() { }, {node_a, node_b}); - auto exec = g.make_exec(q); + auto exec = compile(g); -#ifndef POC_IMPL q.submit(exec).wait(); -#else - exec.exec_and_wait(); -#endif sycl::free(x, q); sycl::free(y, q); @@ -278,6 +260,7 @@ Please, note that the following features are not yet implemented: . Memory operation nodes not implemented . Host node not implemented . Submit overload of a queue. `submit(graph)` Use a combination of `executable_graph::exec_and_wait()` and queue property `sycl::ext::oneapi::property::queue::lazy_execution{}` instead. +. `class graph` Use dedicated `class graph` (equivalent to `graph_state == modifiable`) and `class executable_graph` (equivalent to `graph_state == executable`) instead. 
== Revision History @@ -287,4 +270,5 @@ Please, note that the following features are not yet implemented: |======================================== |Rev|Date|Author|Changes |1|2022-02-11|Pablo Reble|Initial public working draft +|2|2022-03-11|Pablo Reble|Incorporate feedback from PR |======================================== From 0e96d12ff3d438abeeab69a325cc9dbee4578b5d Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Fri, 11 Mar 2022 20:47:16 +0100 Subject: [PATCH 08/43] typo --- sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index efe81d24b767d..28e1d78f5de1b 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be accociated to different graphs. +A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs. 
[source,c++] ---- From 8f1a8dc559d7e3f592c8bcd1b089e9ce6a82a27d Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 14 Mar 2022 14:08:02 +0100 Subject: [PATCH 09/43] Apply suggestions from code review Co-authored-by: Ronan Keryell --- .../extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index 28e1d78f5de1b..960d55e7c282f 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -129,7 +129,7 @@ sycl::event sycl::queue(const graph Graph); === Executable Graph -A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. +A `graph` object in `graph_state::executable` represents a user-generated device and context specific execution object that is submitted to a queue for execution. The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node. Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. @@ -147,7 +147,7 @@ Table 4. Member functions of the `graph` class. |Member function|Description |`node add_node(const std::vector& dep = {});` -|This creates an empty node which is associated to no task. It's intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. +|This creates an empty node which is associated to no task. 
Its intended use is either a connection point inside a graph between groups of nodes, and can significantly reduce the number of edges ( O(n) vs. O(n^2) ). Another use-case is building the structure of a graph first and adding tasks later. |`template node add_node(T cgf, const std::vector& dep = {});` From a8c72654e7ab1ab34a7da1d0c775c0667b2eb0dd Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Mon, 2 May 2022 21:06:42 -0500 Subject: [PATCH 10/43] fix typos and syntax issues --- .../experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc index 28e1d78f5de1b..4b0a5ea805d35 100644 --- a/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc +++ b/sycl/doc/extensions/experimental/SYCL_EXT_ONEAPI_GRAPH.asciidoc @@ -38,7 +38,7 @@ NOTE: This extension is experimental: interfaces are subject to change later. == Introduction This extension introduces an interface that enables a lazy execution and easy replay of a kernel graph by separating -Its definition and execution. +its definition and execution. == Feature test macro @@ -64,7 +64,7 @@ Table 2. Terminology. |Concept|Description |graph| Class that stores structured work units and their dependencies |node| The unit of work. Can have different attributes. -|edge| Dependency between work units. Happens before relation. +|edge| Dependency between work units. Happens-before relation. |=== == Node @@ -85,7 +85,7 @@ NOTE: == Edge -A dependency between two nodes representing a happens before relationship. `sender` and `receiver` may be associated to different graphs. +A dependency between two nodes representing a happens-before relationship. `sender` and `receiver` may be associated to different graphs. 
[source,c++] ---- @@ -119,7 +119,7 @@ namespace sycl::ext::oneapi::experimental { graph make_graph(); - graph compile(const graph Graph); + graph compile(const graph Graph); } @@ -130,7 +130,7 @@ sycl::event sycl::queue(const graph Graph); === Executable Graph A `graph` object in `graph_state::executable` represents a user generated device and context specific execution object that is submitted to a queue for execution. -The structure of such a `graph` object in this state is immutable and can not be changed, so are the tasks assigned with each node. +The structure of such a `graph` object in this state is immutable and cannot be changed, so are the tasks assigned with each node. Support of submitting a graph for execution, before a previous execution has been completed is backend specific. The runtime may throw an error. Table 3. Constructor of the `graph` class. From 60507c10acf56c5624a41f95b4f7637d9b046cb1 Mon Sep 17 00:00:00 2001 From: Julian Miller Date: Tue, 3 May 2022 11:29:34 -0500 Subject: [PATCH 11/43] Propagate lazy queue property --- sycl/include/CL/sycl/detail/pi.h | 1 + sycl/plugins/level_zero/pi_level_zero.cpp | 9 +++++---- sycl/source/detail/queue_impl.hpp | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 4cfd760d7a8ab..2eeadb295d086 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -602,6 +602,7 @@ constexpr pi_queue_properties PI_QUEUE_PROFILING_ENABLE = constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE; constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = CL_QUEUE_ON_DEVICE_DEFAULT; +constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10; using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index fbf8f0ae69601..c4d30a3c65d96 100644 --- 
a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3088,10 +3088,11 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, pi_queue_properties Properties, pi_queue *Queue) { // Check that unexpected bits are not set. - PI_ASSERT(!(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | - PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | - PI_QUEUE_ON_DEVICE_DEFAULT)), - PI_INVALID_VALUE); + PI_ASSERT( + !(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | + PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | + PI_QUEUE_ON_DEVICE_DEFAULT | PI_QUEUE_LAZY_EXECUTION)), + PI_INVALID_VALUE); ze_device_handle_t ZeDevice; ze_command_queue_handle_t ZeComputeCommandQueue; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 1d4d66a2b8b20..89a1c09c66de7 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -305,6 +305,9 @@ class queue_impl { ext::oneapi::cuda::property::queue::use_default_stream>()) { CreationFlags |= __SYCL_PI_CUDA_USE_DEFAULT_STREAM; } + if (has_property()) { + CreationFlags |= PI_QUEUE_LAZY_EXECUTION; + } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef(); From d0069e51192f3230fa5c4edf8e799c57e54ad5db Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 25 May 2022 16:35:57 -0500 Subject: [PATCH 12/43] Update pi_level_zero.cpp --- sycl/plugins/level_zero/pi_level_zero.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index c4d30a3c65d96..b6ffce8ab4eec 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4960,14 +4960,15 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, pi_uint32 NumEventsInWaitList, const 
pi_event *EventWaitList, pi_event *Event) { - auto Res = - piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event); -#if 1 - if(Res == PI_SUCCESS && Queue->isEagerExec()) { - return piKernelLaunch(Queue); + if (auto Res = + piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event)) + return Res; + if(Queue->isEagerExec()) { + if(auto Res = piKernelLaunch(Queue)) + return Res; } -#endif - return Res; + + return PI_SUCCESS; } pi_result piextKernelCreateWithNativeHandle(pi_native_handle NativeHandle, From ed74e86460e3116a49ef61c3ecdd02b82d7f0224 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 25 May 2022 16:44:53 -0500 Subject: [PATCH 13/43] Update pi_level_zero.cpp --- sycl/plugins/level_zero/pi_level_zero.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index b6ffce8ab4eec..fbccd8ca64f7e 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4921,12 +4921,12 @@ piEnqueueKernel(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, if (IndirectAccessTrackingEnabled) Queue->KernelsToBeSubmitted.push_back(Kernel); -#if 0 + if (Queue->isEagerExec()) { // Execute command list asynchronously, as the event will be used // to track down its completion. 
- if (auto Res = Queue->executeCommandList(CommandList, false, true)) + if (auto Res = Queue->executeCommandList(CommandList, false, true, false)) return Res; -#endif + } return PI_SUCCESS; } @@ -4963,11 +4963,13 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim, if (auto Res = piEnqueueKernel(Queue,Kernel,WorkDim,GlobalWorkOffset,GlobalWorkSize,LocalWorkSize,NumEventsInWaitList,EventWaitList,Event)) return Res; +#if 0 if(Queue->isEagerExec()) { if(auto Res = piKernelLaunch(Queue)) return Res; } - +#endif + return PI_SUCCESS; } From 38a6fda5111b0bcad525fd21674512bb1d2c44d0 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 26 May 2022 10:28:13 -0500 Subject: [PATCH 14/43] Avoid flag redefinition and fix semantics --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index fbccd8ca64f7e..36284e0c5e8be 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -840,7 +840,7 @@ bool _pi_queue::isInOrderQueue() const { bool _pi_queue::isEagerExec() const { // If lazy exec queue property is not set, then it's an eager queue. 
- return ((this->Properties & (1<<10) ) == 0); + return ((this->Properties & PI_QUEUE_LAZY_EXECUTION ) != 0); } pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, From 058c875e8944a88b8a17b690f37259ab8d94cf9a Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 26 May 2022 10:53:09 -0500 Subject: [PATCH 15/43] Update pi_level_zero.cpp --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 36284e0c5e8be..22c47349ec410 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -840,7 +840,7 @@ bool _pi_queue::isInOrderQueue() const { bool _pi_queue::isEagerExec() const { // If lazy exec queue property is not set, then it's an eager queue. - return ((this->Properties & PI_QUEUE_LAZY_EXECUTION ) != 0); + return ((this->Properties & PI_QUEUE_LAZY_EXECUTION ) == 0); } pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, From ff540ee5cdb654f246bc6e0ccd52d52443774217 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 07:21:53 -0700 Subject: [PATCH 16/43] replaced with Julian's graph.hpp --- .../sycl/ext/oneapi/experimental/graph.hpp | 1042 +++++++++++++++-- 1 file changed, 933 insertions(+), 109 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 08a7d094e9054..b4e29e132325b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -10,8 +10,8 @@ #include -#include #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -29,90 +29,113 @@ using node_ptr = std::shared_ptr; using graph_ptr = std::shared_ptr; class wrapper { - using T = std::function; - T my_func; - std::vector my_deps; + using T = std::function; + T my_func; + std::vector my_deps; + public: - 
wrapper(T t, const std::vector& deps) : my_func(t), my_deps(deps) {}; + wrapper(T t, const std::vector &deps) + : my_func(t), my_deps(deps){}; - void operator()(sycl::handler& cgh) { - cgh.depends_on(my_deps); - std::invoke(my_func,cgh); - } + void operator()(sycl::handler &cgh) { + cgh.depends_on(my_deps); + std::invoke(my_func, cgh); + } }; struct node_impl { - bool is_scheduled; + bool is_scheduled; + bool is_empty; - graph_ptr my_graph; - sycl::event my_event; + graph_ptr my_graph; + sycl::event my_event; - std::vector my_successors; - std::vector my_predecessors; + std::vector my_successors; + std::vector my_predecessors; - std::function my_body; + std::function my_body; - void exec( sycl::queue q ) { - std::vector __deps; - for(auto i:my_predecessors) __deps.push_back(i->get_event()); - my_event = q.submit(wrapper{my_body,__deps}); + void exec(sycl::queue q) { + std::vector __deps; + std::vector pred_nodes = my_predecessors; + while (!pred_nodes.empty()) { + node_ptr curr_node = pred_nodes.back(); + pred_nodes.pop_back(); + // Add predecessors to dependence list if node is empty + if (curr_node->is_empty) + for (auto i : curr_node->my_predecessors) + pred_nodes.push_back(i); + else + __deps.push_back(curr_node->get_event()); } + if (my_body && !is_empty) + my_event = q.submit(wrapper{my_body, __deps}); + } - void register_successor(node_ptr n) { - my_successors.push_back(n); - n->register_predecessor(node_ptr(this)); - } + void register_successor(node_ptr n) { + my_successors.push_back(n); + n->register_predecessor(node_ptr(this)); + } + + void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + + sycl::event get_event(void) { return my_event; } - void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + node_impl() : is_scheduled(false), is_empty(true) {} - sycl::event get_event(void) {return my_event;} + node_impl(graph_ptr g) : is_scheduled(false), is_empty(true), my_graph(g) {} - template - node_impl(graph_ptr g, T cgf) : 
is_scheduled(false), my_graph(g), my_body(cgf) {} + template + node_impl(graph_ptr g, T cgf) + : is_scheduled(false), is_empty(false), my_graph(g), my_body(cgf) {} - // Recursively adding nodes to execution stack: - void topology_sort(std::list& schedule) { - is_scheduled = true; - for(auto i:my_successors) { - if(!i->is_scheduled) i->topology_sort(schedule); - } - schedule.push_front(node_ptr(this)); + // Recursively adding nodes to execution stack: + void topology_sort(std::list &schedule) { + is_scheduled = true; + for (auto i : my_successors) { + if (!i->is_scheduled) + i->topology_sort(schedule); } + schedule.push_front(node_ptr(this)); + } }; struct graph_impl { - std::set my_roots; - std::list my_schedule; - - graph_ptr parent; - - void exec( sycl::queue q ) { - if( my_schedule.empty() ) { - for(auto n : my_roots) { - n->topology_sort(my_schedule); - } - } - for(auto n : my_schedule) n->exec(q); - } + std::set my_roots; + std::list my_schedule; - void exec_and_wait( sycl::queue q ) { - exec(q); - q.wait(); - } + graph_ptr parent; - void add_root(node_ptr n) { - my_roots.insert(n); - for(auto n : my_schedule) n->is_scheduled=false; - my_schedule.clear(); + void exec(sycl::queue q) { + if (my_schedule.empty()) { + for (auto n : my_roots) { + n->topology_sort(my_schedule); + } } + for (auto n : my_schedule) + n->exec(q); + } - void remove_root(node_ptr n) { - my_roots.erase(n); - for(auto n : my_schedule) n->is_scheduled=false; - my_schedule.clear(); - } + void exec_and_wait(sycl::queue q) { + exec(q); + q.wait(); + } + + void add_root(node_ptr n) { + my_roots.insert(n); + for (auto n : my_schedule) + n->is_scheduled = false; + my_schedule.clear(); + } - graph_impl() {} + void remove_root(node_ptr n) { + my_roots.erase(n); + for (auto n : my_schedule) + n->is_scheduled = false; + my_schedule.clear(); + } + + graph_impl() {} }; } // namespace detail @@ -124,89 +147,890 @@ class graph; class executable_graph; struct node { - // TODO: add properties to distinguish 
between empty, host, device nodes. - detail::node_ptr my_node; - detail::graph_ptr my_graph; + // TODO: add properties to distinguish between empty, host, device nodes. + detail::node_ptr my_node; + detail::graph_ptr my_graph; + + node() : my_node(new detail::node_impl()) {} + + node(detail::graph_ptr g) : my_graph(g), my_node(new detail::node_impl(g)){}; - template - node(detail::graph_ptr g, T cgf) : my_graph(g), my_node(new detail::node_impl(g,cgf)) {}; - void register_successor(node n) { my_node->register_successor(n.my_node); } - void exec( sycl::queue q, sycl::event = sycl::event() ) { my_node->exec(q); } + template + node(detail::graph_ptr g, T cgf) + : my_graph(g), my_node(new detail::node_impl(g, cgf)){}; - void set_root() { my_graph->add_root(my_node);} + template void update(T cgf) { + my_node->is_scheduled = false; + my_node->is_empty = false; + my_node->my_body = cgf; + }; - // TODO: Add query functions: is_root, ... + void register_successor(node n) { my_node->register_successor(n.my_node); } + void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); } + + void set_root() { my_graph->add_root(my_node); } + + // TODO: Add query functions: is_root, ... 
}; class executable_graph { public: - int my_tag; - sycl::queue my_queue; + int my_tag; + sycl::queue my_queue; - void exec_and_wait();// { my_queue.wait(); } + void exec_and_wait(); // { my_queue.wait(); } - executable_graph(detail::graph_ptr g, sycl::queue q) : my_queue(q), my_tag(rand()) { - g->exec(my_queue); - } + executable_graph(detail::graph_ptr g, sycl::queue q) + : my_queue(q), my_tag(rand()) { + g->exec(my_queue); + } }; class graph { public: - // Adding empty node with [0..n] predecessors: - node add_empty_node(const std::vector& dep = {}); + // Adds a node + template node add_node(T cgf, const std::vector &dep = {}); + + template + void add_node(node &Node, T cgf, const std::vector &dep = {}); + + void add_node(node &Node, const std::vector &dep = {}); + + // Adds an empty node + node add_node(const std::vector &dep = {}); + + // Updates a node + void update_node(node &Node, const std::vector &dep = {}); + template + void update_node(node &Node, T cgf, const std::vector &dep = {}); + + // Shortcuts to add graph nodes + + // Adds a fill node + template + node fill(void *Ptr, const T &Pattern, size_t Count, + const std::vector &dep = {}); + template + void fill(node &Node, void *Ptr, const T &Pattern, size_t Count, + const std::vector &dep = {}); + + // Adds a memset node + node memset(void *Ptr, int Value, size_t Count, + const std::vector &dep = {}); + void memset(node &Node, void *Ptr, int Value, size_t Count, + const std::vector &dep = {}); + + // Adds a memcpy node + node memcpy(void *Dest, const void *Src, size_t Count, + const std::vector &dep = {}); + void memcpy(node &Node, void *Dest, const void *Src, size_t Count, + const std::vector &dep = {}); + + // Adds a copy node + template + node copy(const T *Src, T *Dest, size_t Count, + const std::vector &dep = {}); + template + void copy(node &Node, const T *Src, T *Dest, size_t Count, + const std::vector &dep = {}); + + // Adds a mem_advise node + node mem_advise(const void *Ptr, size_t Length, int 
Advice, + const std::vector &dep = {}); + void mem_advise(node &Node, const void *Ptr, size_t Length, int Advice, + const std::vector &dep = {}); - // Adding node for host task - template - node add_host_node(T hostTaskCallable, const std::vector& dep = {}); + // Adds a prefetch node + node prefetch(const void *Ptr, size_t Count, + const std::vector &dep = {}); + void prefetch(node &Node, const void *Ptr, size_t Count, + const std::vector &dep = {}); - // Adding device node: - template - node add_device_node(T cgf, const std::vector& dep = {}); + // Adds a single_task node + template + node single_task(const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void single_task(node &Node, const KernelType &(KernelFunc), + const std::vector &dep = {}); - // Adding dependency between two nodes. - void make_edge(node sender, node receiver); + // Adds a parallel_for node + template + node parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, range<1> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep = {}); - // TODO: Extend queue to directly submit graph - void exec_and_wait( sycl::queue q ); + template + node parallel_for(range<2> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, range<2> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep = {}); - executable_graph exec( sycl::queue q ) { return executable_graph{my_graph,q};}; + template + node parallel_for(range<3> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, range<3> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep = {}); - graph() : my_graph(new detail::graph_impl()) {} + template + node parallel_for(range NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + 
void parallel_for(node &Node, range NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep = {}); - // Creating a subgraph (with predecessors) - graph(graph& parent, const std::vector& dep = {}) {} + template + node parallel_for(range NumWorkItems, id WorkItemOffset, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, range NumWorkItems, + id WorkItemOffset, const KernelType &(KernelFunc), + const std::vector &dep = {}); - bool is_subgraph(); + template + node parallel_for(nd_range ExecutionRange, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, nd_range ExecutionRange, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + + template + node parallel_for(range NumWorkItems, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, range NumWorkItems, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + + template + node parallel_for(nd_range ExecutionRange, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + template + void parallel_for(node &Node, nd_range ExecutionRange, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep = {}); + + // Adds a dependency between two nodes. 
+ void make_edge(node sender, node receiver); + + // TODO: Extend queue to directly submit graph + void exec_and_wait(sycl::queue q); + + executable_graph instantiate(sycl::queue q) { + return executable_graph{my_graph, q}; + }; + + graph() : my_graph(new detail::graph_impl()) {} + + // Creates a subgraph (with predecessors) + graph(graph &parent, const std::vector &dep = {}) {} + + bool is_subgraph(); private: - detail::graph_ptr my_graph; + detail::graph_ptr my_graph; }; void executable_graph::exec_and_wait() { my_queue.wait(); } -template -node graph::add_device_node(T cgf , const std::vector& dep) { - node _node(my_graph,cgf); - if( !dep.empty() ) { - for(auto n : dep) this->make_edge(n,_node); - } else { - _node.set_root(); - } - return _node; +/// Adds a node to the graph, in order to be executed upon graph execution. +/// +/// \param cgf is a function object containing command group. +/// \param dep is a vector of graph nodes the to be added node depends on. +/// \return a graph node representing the command group operation. +template +node graph::add_node(T cgf, const std::vector &dep) { + node _node(my_graph, cgf); + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, _node); + } else { + _node.set_root(); + } + return _node; +} + +/// Adds an empty node to the graph, in order to be executed upon graph +/// execution. +/// +/// \param dep is a vector of graph nodes the to be added node depends on. +/// \return a graph node representing no operations but potentially node +/// dependencies. +node graph::add_node(const std::vector &dep) { + node _node(my_graph); + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, _node); + } else { + _node.set_root(); + } + return _node; +} + +/// Adds a node to the graph, in order to be executed upon graph execution. +/// +/// \param Node is the graph node to be used. This overwrites the node +/// parameters. +/// \param dep is a vector of graph nodes the to be added node depends on. 
+void graph::add_node(node &Node, const std::vector &dep) { + Node.my_graph = this->my_graph; + Node.my_node->my_graph = this->my_graph; + Node.my_node->is_empty = false; + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, Node); + } else { + Node.set_root(); + } +} + +/// Adds a node to the graph, in order to be executed upon graph execution. +/// +/// \param Node is the graph node to be used. This overwrites the node +/// parameters. +/// \param cgf is a function object containing command group. +/// \param dep is a vector of graph nodes the to be added node depends on. +template +void graph::add_node(node &Node, T cgf, const std::vector &dep) { + Node.my_graph = this->my_graph; + Node.my_node->my_graph = this->my_graph; + Node.my_node->my_body = cgf; + Node.my_node->is_empty = false; + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, Node); + } else { + Node.set_root(); + } +} + +/// Sets or updates a graph node by overwriting its dependencies. +/// +/// \param Node is a graph node to be updated. +/// \param dep is a vector of graph nodes the to be updated node depends on. +void graph::update_node(node &Node, const std::vector &dep) { + Node.my_graph = this->my_graph; + Node.my_node->my_graph = this->my_graph; + Node.my_node->is_empty = true; + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, Node); + } else { + Node.set_root(); + } +} + +/// Sets or updates a graph node by overwriting its parameters. +/// +/// \param Node is a graph node to be updated. +/// \param cgf is a function object containing command group. +/// \param dep is a vector of graph nodes the to be updated node depends on. 
+template +void graph::update_node(node &Node, T cgf, const std::vector &dep) { + Node.my_graph = this->my_graph; + Node.my_node->my_graph = this->my_graph; + Node.my_node->my_body = cgf; + Node.my_node->is_empty = false; + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, Node); + } else { + Node.set_root(); + } +} + +/// Fills the specified memory with the specified pattern. +/// +/// \param Ptr is the pointer to the memory to fill. +/// \param Pattern is the pattern to fill into the memory. T should be +/// trivially copyable. +/// \param Count is the number of times to fill Pattern into Ptr. +/// \param dep is a vector of graph nodes the fill depends on. +/// \return a graph node representing the fill operation. +template +node graph::fill(void *Ptr, const T &Pattern, size_t Count, + const std::vector &dep) { + return graph::add_node([=](sycl::handler &h) { h.fill(Ptr, Pattern, Count); }, + dep); +} + +/// Fills the specified memory with the specified pattern. +/// +/// \param Node is the graph node to be used for the fill. This overwrites +/// the node parameters. +/// \param Ptr is the pointer to the memory to fill. +/// \param Pattern is the pattern to fill into the memory. T should be +/// trivially copyable. +/// \param Count is the number of times to fill Pattern into Ptr. +/// \param dep is a vector of graph nodes the fill depends on. +template +void graph::fill(node &Node, void *Ptr, const T &Pattern, size_t Count, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.fill(Ptr, Pattern, Count); }, dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Dest is a USM pointer to the destination memory. +/// \param Src is a USM pointer to the source memory. 
+/// \param dep is a vector of graph nodes the memset depends on. +/// \return a graph node representing the memset operation. +node graph::memset(void *Ptr, int Value, size_t Count, + const std::vector &dep) { + return graph::add_node([=](sycl::handler &h) { h.memset(Ptr, Value, Count); }, + dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Node is the graph node to be used for the memset. This overwrites +/// the node parameters. +/// \param Dest is a USM pointer to the destination memory. +/// \param Src is a USM pointer to the source memory. +/// \param dep is a vector of graph nodes the memset depends on. +void graph::memset(node &Node, void *Ptr, int Value, size_t Count, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.memset(Ptr, Value, Count); }, dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Dest is a USM pointer to the destination memory. +/// \param Src is a USM pointer to the source memory. +/// \param Count is a number of bytes to copy. +/// \param dep is a vector of graph nodes the memcpy depends on. +/// \return a graph node representing the memcpy operation. +node graph::memcpy(void *Dest, const void *Src, size_t Count, + const std::vector &dep) { + return graph::add_node([=](sycl::handler &h) { h.memcpy(Dest, Src, Count); }, + dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. 
An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Node is the graph node to be used for the memcpy. This overwrites +/// the node parameters. +/// \param Dest is a USM pointer to the destination memory. +/// \param Src is a USM pointer to the source memory. +/// \param Count is a number of bytes to copy. +/// \param dep is a vector of graph nodes the memcpy depends on. +void graph::memcpy(node &Node, void *Dest, const void *Src, size_t Count, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.memcpy(Dest, Src, Count); }, dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Src is a USM pointer to the source memory. +/// \param Dest is a USM pointer to the destination memory. +/// \param Count is a number of elements of type T to copy. +/// \param dep is a vector of graph nodes the copy depends on. +/// \return a graph node representing the copy operation. +template +node graph::copy(const T *Src, T *Dest, size_t Count, + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { h.memcpy(Dest, Src, Count * sizeof(T)); }, dep); +} + +/// Copies data from one memory region to another, both pointed by +/// USM pointers. +/// No operations is done if \param Count is zero. An exception is thrown +/// if either \param Dest or \param Src is nullptr. The behavior is undefined +/// if any of the pointer parameters is invalid. +/// +/// \param Node is the graph node to be used for the copy. This overwrites +/// the node parameters. +/// \param Src is a USM pointer to the source memory. +/// \param Dest is a USM pointer to the destination memory. 
+/// \param Count is a number of elements of type T to copy. +/// \param dep is a vector of graph nodes the copy depends on. +template +void graph::copy(node &Node, const T *Src, T *Dest, size_t Count, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.memcpy(Dest, Src, Count * sizeof(T)); }, + dep); +} + +/// Provides additional information to the underlying runtime about how +/// different allocations are used. +/// +/// \param Ptr is a USM pointer to the allocation. +/// \param Length is a number of bytes in the allocation. +/// \param Advice is a device-defined advice for the specified allocation. +/// \param dep is a vector of graph nodes the mem_advise depends on. +/// \return a graph node representing the mem_advise operation. +node graph::mem_advise(const void *Ptr, size_t Length, int Advice, + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { h.mem_advise(Ptr, Length, Advice); }, dep); +} + +/// Provides additional information to the underlying runtime about how +/// different allocations are used. +/// +/// \param Node is the graph node to be used for the mem_advise. This overwrites +/// the node parameters. +/// \param Ptr is a USM pointer to the allocation. +/// \param Length is a number of bytes in the allocation. +/// \param Advice is a device-defined advice for the specified allocation. +/// \param dep is a vector of graph nodes the mem_advise depends on. +void graph::mem_advise(node &Node, const void *Ptr, size_t Length, int Advice, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.mem_advise(Ptr, Length, Advice); }, dep); +} + +/// Provides hints to the runtime library that data should be made available +/// on a device earlier than Unified Shared Memory would normally require it +/// to be available. +/// +/// \param Ptr is a USM pointer to the memory to be prefetched to the device. +/// \param Count is a number of bytes to be prefetched. 
+/// \param dep is a vector of graph nodes the prefetch depends on. +/// \return a graph node representing the prefetch operation. +node graph::prefetch(const void *Ptr, size_t Count, + const std::vector &dep) { + return graph::add_node([=](sycl::handler &h) { h.prefetch(Ptr, Count); }, + dep); +} + +/// Provides hints to the runtime library that data should be made available +/// on a device earlier than Unified Shared Memory would normally require it +/// to be available. +/// +/// \param Node is the graph node to be used for the prefetch. This overwrites +/// the node parameters. +/// \param Ptr is a USM pointer to the memory to be prefetched to the device. +/// \param Count is a number of bytes to be prefetched. +/// \param dep is a vector of graph nodes the prefetch depends on. +void graph::prefetch(node &Node, const void *Ptr, size_t Count, + const std::vector &dep) { + graph::update_node( + Node, [=](sycl::handler &h) { h.prefetch(Ptr, Count); }, dep); +} + +/// single_task version with a kernel represented as a lambda. +/// +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the single_task depends on. +/// \return a graph node representing the single_task operation. +template +node graph::single_task(const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template single_task(KernelFunc); + }, + dep); +} + +/// single_task version with a kernel represented as a lambda. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the single_task depends on. 
+template +void graph::single_task(node &Node, const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template single_task(KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range<1> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. 
+template +node graph::parallel_for(range<2> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range<2> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(range<3> NumWorkItems, const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. 
+/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range<3> NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(range NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global size only. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range NumWorkItems, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for(NumWorkItems, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range and +/// offset that specify global size and global offset correspondingly. 
+/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param WorkItemOffset specifies the offset for each work item id +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(range NumWorkItems, id WorkItemOffset, + const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range and +/// offset that specify global size and global offset correspondingly. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param WorkItemOffset specifies the offset for each work item id +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range NumWorkItems, + id WorkItemOffset, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + nd_range that +/// specifies global, local sizes and offset. +/// +/// \param ExecutionRange is a range that specifies the work space of the +/// kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. 
+template +node graph::parallel_for(nd_range ExecutionRange, + const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for(ExecutionRange, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + nd_range that +/// specifies global, local sizes and offset. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. +/// \param ExecutionRange is a range that specifies the work space of the +/// kernel +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, nd_range ExecutionRange, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for(ExecutionRange, + KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global, local sizes and offset. +/// +/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param Redu is a reduction operation +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(range NumWorkItems, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for( + NumWorkItems, Redu, KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + range that +/// specifies global, local sizes and offset. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. 
+/// \param NumWorkItems is a range that specifies the work space of the kernel +/// \param Redu is a reduction operation +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, range NumWorkItems, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for( + NumWorkItems, Redu, KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + nd_range that +/// specifies global, local sizes and offset. +/// +/// \param ExecutionRange is a range that specifies the work space of the +/// kernel +/// \param Redu is a reduction operation +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +/// \return a graph node representing the parallel_for operation. +template +node graph::parallel_for(nd_range ExecutionRange, Reduction Redu, + const KernelType &(KernelFunc), + const std::vector &dep) { + return graph::add_node( + [=](sycl::handler &h) { + h.template parallel_for( + ExecutionRange, Redu, KernelFunc); + }, + dep); +} + +/// parallel_for version with a kernel represented as a lambda + nd_range that +/// specifies global, local sizes and offset. +/// +/// \param Node is the graph node to be used for the single_task. This +/// overwrites the node parameters. 
+/// \param ExecutionRange is a range that specifies the work space of the +/// kernel +/// \param Redu is a reduction operation +/// \param KernelFunc is the Kernel functor or lambda +/// \param dep is a vector of graph nodes the parallel_for depends on +template +void graph::parallel_for(node &Node, nd_range ExecutionRange, + Reduction Redu, const KernelType &(KernelFunc), + const std::vector &dep) { + graph::update_node( + Node, + [=](sycl::handler &h) { + h.template parallel_for( + ExecutionRange, Redu, KernelFunc); + }, + dep); } void graph::make_edge(node sender, node receiver) { - sender.register_successor(receiver);//register successor - my_graph->remove_root(receiver.my_node); //remove receiver from root node list + sender.register_successor(receiver); // register successor + my_graph->remove_root(receiver.my_node); // remove receiver from root node + // list } -void graph::exec_and_wait( sycl::queue q ) { - my_graph->exec_and_wait(q); -}; +void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); }; } // namespace experimental } // namespace oneapi } // namespace ext } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) - From 47251a2c442c1ef55cfc1399a74e868583830ce7 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 08:15:15 -0700 Subject: [PATCH 17/43] added makefile and run.sh --- sycl/examples/Makefile | 183 +++++++++++++++++++++++++++++++++++++++++ sycl/examples/run.sh | 40 +++++++++ 2 files changed, 223 insertions(+) create mode 100644 sycl/examples/Makefile create mode 100755 sycl/examples/run.sh diff --git a/sycl/examples/Makefile b/sycl/examples/Makefile new file mode 100644 index 0000000000000..2aab1c08f2e8e --- /dev/null +++ b/sycl/examples/Makefile @@ -0,0 +1,183 @@ +# SYCL compiler + +# Set to the path of the Intel LLVM build with the SYCL Graph extension. 
+#LLVMBUILDDIR = ${DPCPP_HOME}/llvm/build +#export PATH := ${LLVMBUILDDIR}/bin:${PATH} +#export LD_LIBRARY_PATH := ${LLVMBUILDDIR}/lib:${LD_LIBRARY_PATH} + +CXX = clang++ +#CPPFLAGS = -fsycl -std=c++17 -O3 -fsycl-device-code-split=off -fsycl-early-optimizations -fsycl-dead-args-optimization -Wall -Wpedantic -v +CPPFLAGS = -fsycl -std=c++17 -g -fsycl-unnamed-lambda +CPPFLAGSGRAPH = -D SYCL_EXT_ONEAPI_LAZY_QUEUE=1 +CPPINCLUDEPATH = -I${DPCPP_HOME}/llvm/sycl/include -I${DPCPP_HOME}/llvm/include +# Enables AOT compilation. Specify the target device. Disable AOT and use JIT by setting DEVICE to an empty string. +CPPFLAGSAOT = -fsycl-targets=spir64_gen -Xs "-device Gen9" # Intel Gen9 GPU +# CPPFLAGSAOT = -fsycl-targets=nvptx64-nvidia-cuda # NVIDIA GPU + +# Set the L0 command batch size to 20. This is a workaround with the current PoC since the dynamic adjusting of the command batch size does not work with the lazy queue. +#ENVGRAPH = SYCL_PI_LEVEL_ZERO_BATCH_SIZE=20 + + +# CUDA compiler +NVCC = nvcc +# Adjust for the target architecture +NVFLAGS = -O3 -lineinfo -arch=sm_75 + + +SRCS = $(wildcard *.cpp) $(wildcard *.hpp) $(wildcard *.cu) + +#1DHEAT-OBJECTS = 1Dheat-sycl-buffer 1Dheat-sycl-usm 1Dheat-sycl-buffer-graph 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll + +#ACCESSORS-OBJECTS = accessors-sycl-buffer accessors-sycl-usm accessors-sycl-buffer-graph accessors-sycl-usm-graph + +#APSP-OBJECTS = apsp-sycl-buffer apsp-sycl-usm apsp-sycl-buffer-graph apsp-sycl-usm-graph apsp-sycl-usm-graph-capture apsp-sycl-usm-graph-unroll + +#DOTP-OBJECTS = dotp-sycl-buffer dotp-sycl-usm dotp-sycl-buffer-graph dotp-sycl-usm-graph dotp-sycl-usm-graph-capture + +#ISO2DFD-OBJECTS = iso2dfd-sycl-buffer iso2dfd-sycl-usm iso2dfd-sycl-buffer-graph iso2dfd-sycl-usm-graph iso2dfd-sycl-usm-graph-capture iso2dfd-sycl-usm-graph-unroll + +#NN-OBJECTS = nn-sycl-usm nn-sycl-usm-graph nn-sycl-usm-graph-unroll nn-cuda nn-cuda-graphs nn-cuda-graphs-explicit + 
+#SYCL-USM-OBJECTS = 1Dheat-sycl-usm accessors-sycl-usm apsp-sycl-usm dotp-sycl-usm iso2dfd-sycl-usm nn-sycl-usm + +#SYCL-BUFFER-OBJECTS = 1Dheat-sycl-buffer accessors-sycl-buffer apsp-sycl-buffer dotp-sycl-buffer iso2dfd-sycl-buffer + +#SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll accessors-sycl-usm-graph apsp-sycl-usm-graph apsp-sycl-usm-graph-capture apsp-sycl-usm-graph-unroll dotp-sycl-usm-graph dotp-sycl-usm-graph-capture iso2dfd-sycl-usm-graph iso2dfd-sycl-usm-graph-capture iso2dfd-sycl-usm-graph-unroll nn-sycl-usm-graph nn-sycl-usm-graph-unroll +#SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture iso2dfd-sycl-usm-graph accessors-sycl-usm-graph apsp-sycl-usm-graph dotp-sycl-usm-graph nn-sycl-usm-graph +SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph + +#SYCL-BUFFER-GRAPH-OBJECTS = 1Dheat-sycl-buffer-graph accessors-sycl-buffer-graph apsp-sycl-buffer-graph dotp-sycl-buffer-graph iso2dfd-sycl-buffer-graph + +.PHONY: all 1Dheat accessors apsp dotp iso2dfd nn sycl-graph sycl-usm-graph sycl-buffer-graph run run-1Dheat run-accessors run-apsp run-dotp run-iso2dfd run-nn run-sycl-usm-graph run-sycl-buffer-graph format tidy clean + +#all: 1Dheat accessors apsp dotp iso2dfd nn +#1Dheat: $(1DHEAT-OBJECTS) +#accessors: $(ACCESSORS-OBJECTS) +#apsp: $(APSP-OBJECTS) +#dotp: $(DOTP-OBJECTS) +#iso2dfd: $(ISO2DFD-OBJECTS) +#nn: $(NN-OBJECTS) +# +#sycl: sycl-usm sycl-buffer +#sycl-usm: $(SYCL-USM-OBJECTS) +#sycl-buffer: $(SYCL-BUFFER-OBJECTS) +# +#sycl-graph: sycl-usm-graph sycl-buffer-graph +sycl-usm-graph: $(SYCL-USM-GRAPH-OBJECTS) +#sycl-buffer-graph: $(SYCL-BUFFER-GRAPH-OBJECTS) +# +#run: run-1Dheat run-accessors run-apsp run-dotp run-iso2dfd run-nn +#run-1Dheat: $(addprefix run-,$(1DHEAT-OBJECTS)) +#run-accessors: $(addprefix run-,$(ACCESSORS-OBJECTS)) +#run-apsp: $(addprefix run-,$(APSP-OBJECTS)) +#run-dotp: $(addprefix run-,$(DOTP-OBJECTS)) +#run-iso2dfd: $(addprefix 
run-,$(ISO2DFD-OBJECTS)) +#run-nn: $(addprefix run-,$(NN-OBJECTS)) +# +#run-sycl: run-sycl-usm run-sycl-buffer +#run-sycl-usm: $(addprefix run-,$(SYCL-USM-OBJECTS)) +#run-sycl-buffer: $(addprefix run-,$(SYCL-BUFFER-OBJECTS)) +# +#run-sycl-graph: run-sycl-usm-graph run-sycl-buffer-graph +run-sycl-usm-graph: $(addprefix run-,$(SYCL-USM-GRAPH-OBJECTS)) +#run-sycl-buffer-graph: $(addprefix run-,$(SYCL-BUFFER-GRAPH-OBJECTS)) +# +#%-sycl-buffer: %-sycl-buffer.cpp +# $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $< -o $@ +# +#%-sycl-usm: %-sycl-usm.cpp +# $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $< -o $@ +# +%-sycl-usm-graph: %-sycl-usm-graph.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ + +%-sycl-usm-graph-capture: %-sycl-usm-graph-capture.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ +# +#%-sycl-usm-graph-unroll: %-sycl-usm-graph-unroll.cpp +# $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $(CPPFLAGSGRAPH) $< -o $@ +# +#run-1Dheat-sycl-buffer: 1Dheat-sycl-buffer +# ./$< 10000 1000 +# +#run-1Dheat-sycl-usm: 1Dheat-sycl-usm +# ./$< 10000 1000 +# +#run-1Dheat-sycl-buffer-graph: 1Dheat-sycl-buffer-graph +# $(ENVGRAPH) ./$< 10000 1000 +# +run-1Dheat-sycl-usm-graph: 1Dheat-sycl-usm-graph + ./$< 100 10 +# $(ENVGRAPH) ./$< 10000 1000 + +run-1Dheat-sycl-usm-graph-capture: 1Dheat-sycl-usm-graph-capture + $(ENVGRAPH) ./$< 1000 1 + + +#run-1Dheat-sycl-usm-graph-unroll: 1Dheat-sycl-usm-graph-unroll +# $(ENVGRAPH) ./$< 10000 1000 +# +#run-iso2dfd-sycl-buffer: iso2dfd-sycl-buffer +# ./$< 1024 1024 2000 +# +#run-iso2dfd-sycl-usm: iso2dfd-sycl-usm +# ./$< 1024 1024 2000 +# +#run-iso2dfd-sycl-buffer-graph: iso2dfd-sycl-buffer-graph +# $(ENVGRAPH) ./$< 1024 1024 2000 +# +run-iso2dfd-sycl-usm-graph: iso2dfd-sycl-usm-graph + $(ENVGRAPH) ./$< 16 16 20 +# $(ENVGRAPH) ./$< 1024 1024 2000 +# +#run-iso2dfd-sycl-usm-graph-capture: iso2dfd-sycl-usm-graph-capture +# $(ENVGRAPH) ./$< 1024 1024 2000 +# +#run-iso2dfd-sycl-usm-graph-unroll: iso2dfd-sycl-usm-graph-unroll +# $(ENVGRAPH) ./$< 1024 1024 2000 +# 
+#run-%-sycl-buffer: %-sycl-buffer +# ./$< +# +#run-%-sycl-usm: %-sycl-usm +# ./$< +# +#run-%-sycl-buffer-graph: %-sycl-buffer-graph +# $(ENVGRAPH) ./$< +# +run-%-sycl-usm-graph: %-sycl-usm-graph + $(ENVGRAPH) ./$< +# +#run-%-sycl-usm-graph-capture: %-sycl-usm-graph-capture +# $(ENVGRAPH) ./$< +# +#run-%-sycl-usm-graph-unroll: %-sycl-usm-graph-unroll +# $(ENVGRAPH) ./$< +# +#%-cuda: %-cuda.cu +# $(NVCC) $(NVFLAGS) $< -o $@ +# +#%-cuda-graphs: %-cuda-graphs.cu +# $(NVCC) $(NVFLAGS) $< -o $@ +# +#%-cuda-graphs-capture: %-cuda-graphs-capture.cu +# $(NVCC) $(NVFLAGS) $< -o $@ +# +#run-%-cuda: %-cuda +# ./$< +# +#run-%-cuda-graphs: %-cuda-graphs +# ./$< +# +#run-%-cuda-graphs-capture: %-cuda-graphs-capture +# ./$< + +format: + clang-format -i $(SRCS) + +tidy: + clang-tidy -header-filter=.* $(SRCS) -- -I${LLVMBUILDDIR}/include -I${LLVMBUILDDIR}/include/sycl + +clean: + rm -f $(1DHEAT-OBJECTS) $(ACCESSORS-OBJECTS) $(APSP-OBJECTS) $(DOTP-OBJECTS) $(ISO2DFD-OBJECTS) $(NN-OBJECTS) wavefield_snapshot.bin wavefield_snapshot_cpu.bin *_error_diff.txt + diff --git a/sycl/examples/run.sh b/sycl/examples/run.sh new file mode 100755 index 0000000000000..814078cddc2a4 --- /dev/null +++ b/sycl/examples/run.sh @@ -0,0 +1,40 @@ +#!/bin/bash +#source /opt/intel/inteloneapi/setvars.sh > /dev/null 2>&1 + +export DPCPP_HOME=/home/u154255/sycl_graph/chchiu +export LLVMBUILDDIR=$DPCPP_HOME/llvm/build + +export PATH=$LLVMBUILDDIR/bin:$PATH +export LD_LIBRARY_PATH=$LLVMBUILDDIR/lib:$LD_LIBRARY_PATH + +#fullname=$1 +#filename="${fullname%%.*}" +# +#clang++ -g $1 -o $filename -I$DPCPP_HOME/sycl/include -I$DPCPP_HOME/include -std=c++17 -fsycl -fsycl-unnamed-lambda +#export LD_LIBRARY_PATH=$LLVMBUILDDIR/lib:$LD_LIBRARY_PATH +#./$filename 1 1 +# + +# clean all the executable and output texts +make clean + +# compile *-sycl-usm-graph source code +make sycl-usm-graph + +# execute *-sycl-usm-graph executables +echo "" +echo "------ Start running executables ------" +echo "" +make run-sycl-usm-graph 
+echo "" +echo "------ Finish running executables ------" +echo "" + + + + +#export LD_LIBRARY_PATH=$LLVMBUILDDIR/lib:$LD_LIBRARY_PATH +#rm -rf ./a.out +#PATH=$LLVMBUILDDIR/bin:$PATH clang++ -g $1 -I $LLVMBUILDDIR/include/sycl -I $LLVMBUILDDIR/include -std=c++17 -fsycl -fsycl-unnamed-lambda + + From aa03781a2a2393215cdf9767b7e5ef03efab6f97 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 08:45:42 -0700 Subject: [PATCH 18/43] updated makefile --- sycl/examples/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/examples/Makefile b/sycl/examples/Makefile index 2aab1c08f2e8e..0dda564533baf 100644 --- a/sycl/examples/Makefile +++ b/sycl/examples/Makefile @@ -44,7 +44,7 @@ SRCS = $(wildcard *.cpp) $(wildcard *.hpp) $(wildcard *.cu) #SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll accessors-sycl-usm-graph apsp-sycl-usm-graph apsp-sycl-usm-graph-capture apsp-sycl-usm-graph-unroll dotp-sycl-usm-graph dotp-sycl-usm-graph-capture iso2dfd-sycl-usm-graph iso2dfd-sycl-usm-graph-capture iso2dfd-sycl-usm-graph-unroll nn-sycl-usm-graph nn-sycl-usm-graph-unroll #SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture iso2dfd-sycl-usm-graph accessors-sycl-usm-graph apsp-sycl-usm-graph dotp-sycl-usm-graph nn-sycl-usm-graph -SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph +SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture #SYCL-BUFFER-GRAPH-OBJECTS = 1Dheat-sycl-buffer-graph accessors-sycl-buffer-graph apsp-sycl-buffer-graph dotp-sycl-buffer-graph iso2dfd-sycl-buffer-graph @@ -107,7 +107,7 @@ run-sycl-usm-graph: $(addprefix run-,$(SYCL-USM-GRAPH-OBJECTS)) # $(ENVGRAPH) ./$< 10000 1000 # run-1Dheat-sycl-usm-graph: 1Dheat-sycl-usm-graph - ./$< 100 10 + $(ENVGRAPH) ./$< 100 10 # $(ENVGRAPH) ./$< 10000 1000 run-1Dheat-sycl-usm-graph-capture: 1Dheat-sycl-usm-graph-capture From cae401ee41bf8f8cffd4794570e9edc8770c3540 Mon Sep 17 00:00:00 2001 
From: u154255 Date: Wed, 15 Jun 2022 08:47:05 -0700 Subject: [PATCH 19/43] added submit with executable_graph --- sycl/include/CL/sycl/queue.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index f21ffec77230c..fb2d9976253df 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -81,6 +81,14 @@ class context; class device; class queue; +namespace ext{ +namespace oneapi{ +namespace experimental{ +class executable_graph; +} +} +} + namespace detail { class queue_impl; #if __SYCL_USE_FALLBACK_ASSERT @@ -242,6 +250,10 @@ class __SYCL_EXPORT queue { bool device_has(aspect Aspect) const; public: + + void submit(sycl::ext::oneapi::experimental::executable_graph&); + + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// From eead9e52935d49ffd4511872de2229aa729e156f Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 08:47:37 -0700 Subject: [PATCH 20/43] added submit definition --- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index b4e29e132325b..ed1fa5e89edd4 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -1032,5 +1032,16 @@ void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); }; } // namespace experimental } // namespace oneapi } // namespace ext + + +void sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { + g.exec_and_wait(); +} + + + + + + } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) From ef1cf00d1d885fdc06dd6f7fa4ee26fdbba9ddb0 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 09:44:26 -0700 Subject: [PATCH 21/43] added capture_mode --- sycl/include/CL/sycl/detail/pi.h | 
1 + sycl/include/CL/sycl/detail/property_helper.hpp | 3 ++- sycl/include/CL/sycl/feature_test.hpp.in | 1 + sycl/include/CL/sycl/properties/queue_properties.hpp | 8 ++++++++ sycl/plugins/level_zero/pi_level_zero.cpp | 9 ++++++--- sycl/source/detail/queue_impl.cpp | 3 ++- sycl/source/detail/queue_impl.hpp | 3 +++ 7 files changed, 23 insertions(+), 5 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 2eeadb295d086..92204f669c8c7 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -603,6 +603,7 @@ constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE; constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = CL_QUEUE_ON_DEVICE_DEFAULT; constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10; +constexpr pi_queue_properties PI_QUEUE_CAPTURE_MODE = 1 << 11;ENABLE | using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; diff --git a/sycl/include/CL/sycl/detail/property_helper.hpp b/sycl/include/CL/sycl/detail/property_helper.hpp index 9dc34de890c69..ef9c22ae83792 100644 --- a/sycl/include/CL/sycl/detail/property_helper.hpp +++ b/sycl/include/CL/sycl/detail/property_helper.hpp @@ -35,8 +35,9 @@ enum DataLessPropKind { UseDefaultStream = 8, DiscardEvents = 9, LazyExecution = 10, + CaptureMode = 11, // Indicates the last known dataless property. - LastKnownDataLessPropKind = 10, + LastKnownDataLessPropKind = 11, // Exceeding 32 may cause ABI breaking change on some of OSes. 
DataLessPropKindSize = 32 }; diff --git a/sycl/include/CL/sycl/feature_test.hpp.in b/sycl/include/CL/sycl/feature_test.hpp.in index c3444c94e98de..40a6e273313ce 100644 --- a/sycl/include/CL/sycl/feature_test.hpp.in +++ b/sycl/include/CL/sycl/feature_test.hpp.in @@ -48,6 +48,7 @@ namespace sycl { #define SYCL_EXT_ONEAPI_GROUP_ALGORITHMS 1 #define SYCL_EXT_ONEAPI_GROUP_SORT 1 #define SYCL_EXT_ONEAPI_LAZY_QUEUE 1 +#define SYCL_EXT_ONEAPI_CAPTURE_MODE 1 #define SYCL_EXT_ONEAPI_MAX_WORK_GROUP_QUERY 1 #define SYCL_EXT_ONEAPI_ND_RANGE_REDUCTIONS 1 #define SYCL_EXT_ONEAPI_DEFAULT_CONTEXT 1 diff --git a/sycl/include/CL/sycl/properties/queue_properties.hpp b/sycl/include/CL/sycl/properties/queue_properties.hpp index c9fb6e88d890c..75b476a73a428 100644 --- a/sycl/include/CL/sycl/properties/queue_properties.hpp +++ b/sycl/include/CL/sycl/properties/queue_properties.hpp @@ -30,6 +30,8 @@ class discard_events : public ::cl::sycl::detail::DataLessProperty< ::cl::sycl::detail::DiscardEvents> {}; class lazy_execution : public ::cl::sycl::detail::DataLessProperty< ::cl::sycl::detail::LazyExecution> {}; +class capture_mode : public ::cl::sycl::detail::DataLessProperty< + ::cl::sycl::detail::CaptureMode> {}; } // namespace queue } // namespace property @@ -68,6 +70,9 @@ template <> struct is_property : std::true_type {}; template <> +struct is_property + : std::true_type {}; +template <> struct is_property : std::true_type { }; template <> @@ -86,6 +91,9 @@ template <> struct is_property_of : std::true_type {}; template <> +struct is_property_of + : std::true_type {}; +template <> struct is_property_of : std::true_type {}; template <> diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 22c47349ec410..269790fca0fed 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -839,8 +839,10 @@ bool _pi_queue::isInOrderQueue() const { } bool _pi_queue::isEagerExec() const { - // If lazy exec 
queue property is not set, then it's an eager queue. - return ((this->Properties & PI_QUEUE_LAZY_EXECUTION ) == 0); + // If neither lazy exec nor capture mode + // queue property is set, then it's an eager queue. + return (((this->Properties & PI_QUEUE_LAZY_EXECUTION) == 0) || + ((this->Properties & PI_QUEUE_CAPTURE_MODE) == 0)); } pi_result _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList, @@ -3091,7 +3093,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device, PI_ASSERT( !(Properties & ~(PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | PI_QUEUE_PROFILING_ENABLE | PI_QUEUE_ON_DEVICE | - PI_QUEUE_ON_DEVICE_DEFAULT | PI_QUEUE_LAZY_EXECUTION)), + PI_QUEUE_ON_DEVICE_DEFAULT | PI_QUEUE_LAZY_EXECUTION | + PI_QUEUE_CAPTURE_MODE)), PI_INVALID_VALUE); ze_device_handle_t ZeDevice; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 1fada9904c6cb..ebe3b328a99b0 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -275,7 +275,8 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { #endif #if 1 - if(has_property()){ + if(has_property() || + has_property()){ const detail::plugin &Plugin = getPlugin(); if (Plugin.getBackend() == backend::ext_oneapi_level_zero) Plugin.call(getHandleRef()); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 89a1c09c66de7..9288073ff6774 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -308,6 +308,9 @@ class queue_impl { if (has_property()) { CreationFlags |= PI_QUEUE_LAZY_EXECUTION; } + if (has_property()) { + CreationFlags |= PI_QUEUE_CAPTURE_MODE; + } RT::PiQueue Queue{}; RT::PiContext Context = MContext->getHandleRef(); RT::PiDevice Device = MDevice->getHandleRef(); From d396da8bc112e44a4edf94cbc40b93c589a7b192 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 15 Jun 2022 09:47:32 -0700 Subject: [PATCH 22/43] fixed typo --- sycl/include/CL/sycl/detail/pi.h | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 92204f669c8c7..cbe7357e30ad1 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -603,7 +603,7 @@ constexpr pi_queue_properties PI_QUEUE_ON_DEVICE = CL_QUEUE_ON_DEVICE; constexpr pi_queue_properties PI_QUEUE_ON_DEVICE_DEFAULT = CL_QUEUE_ON_DEVICE_DEFAULT; constexpr pi_queue_properties PI_QUEUE_LAZY_EXECUTION = 1 << 10; -constexpr pi_queue_properties PI_QUEUE_CAPTURE_MODE = 1 << 11;ENABLE | +constexpr pi_queue_properties PI_QUEUE_CAPTURE_MODE = 1 << 11; using pi_result = _pi_result; using pi_platform_info = _pi_platform_info; From 5ae262863f097546bff4b5ba8dd2ec606bd53438 Mon Sep 17 00:00:00 2001 From: u154255 Date: Thu, 16 Jun 2022 06:49:29 -0700 Subject: [PATCH 23/43] added two capture APIs --- sycl/include/CL/sycl/queue.hpp | 4 ++++ sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index fb2d9976253df..e76c8207d9c77 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -85,6 +85,7 @@ namespace ext{ namespace oneapi{ namespace experimental{ class executable_graph; +class graph; } } } @@ -251,7 +252,10 @@ class __SYCL_EXPORT queue { public: + sycl::ext::oneapi::experimental::graph* my_graph_ptr; void submit(sycl::ext::oneapi::experimental::executable_graph&); + void begin_capture(sycl::ext::oneapi::experimental::graph*); + void end_capture() const; /// Submits a command group function object to the queue, in order to be diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index ed1fa5e89edd4..79311ea4bc7ad 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -1038,8 +1038,11 @@ void 
sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { g.exec_and_wait(); } +void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* ptr) { + my_graph_ptr = ptr; +} - +void sycl::queue::end_capture() const {} From 345683eedd47a91a9f4b0d66e611df1267c9f9bd Mon Sep 17 00:00:00 2001 From: u154255 Date: Thu, 16 Jun 2022 08:08:01 -0700 Subject: [PATCH 24/43] added is_lazy and is_capture APIs --- sycl/include/CL/sycl/queue.hpp | 4 ++++ sycl/source/queue.cpp | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index e76c8207d9c77..0601f46d58d33 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -1064,6 +1064,10 @@ class __SYCL_EXPORT queue { /// Equivalent to has_property() bool is_in_order() const; + bool is_lazy() const; + bool is_capture() const; + + /// Returns the backend associated with this queue. /// /// \return the backend associated with this queue. 
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 46ebc1d3a4d15..95db4c67068c9 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -194,6 +194,14 @@ bool queue::is_in_order() const { return impl->has_property(); } +bool queue::is_lazy() const { + return impl->has_property(); +} + +bool queue::is_capture() const { + return impl->has_property(); +} + backend queue::get_backend() const noexcept { return getImplBackend(impl); } pi_native_handle queue::getNative() const { return impl->getNative(); } From 1aa81e0673ef9c4beac80b97e72934fbefb38813 Mon Sep 17 00:00:00 2001 From: u154255 Date: Thu, 16 Jun 2022 09:13:02 -0700 Subject: [PATCH 25/43] queue is eager if both properties are not set --- sycl/plugins/level_zero/pi_level_zero.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 269790fca0fed..e73dbbdadda90 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -841,7 +841,7 @@ bool _pi_queue::isInOrderQueue() const { bool _pi_queue::isEagerExec() const { // If neither lazy exec nor capture mode // queue property is set, then it's an eager queue. 
- return (((this->Properties & PI_QUEUE_LAZY_EXECUTION) == 0) || + return (((this->Properties & PI_QUEUE_LAZY_EXECUTION) == 0) && ((this->Properties & PI_QUEUE_CAPTURE_MODE) == 0)); } From 238028c30ab59660d0bad424e9dda4e4fd78628e Mon Sep 17 00:00:00 2001 From: u154255 Date: Fri, 17 Jun 2022 14:35:57 -0700 Subject: [PATCH 26/43] inlined functions in graph.hpp --- sycl/include/CL/sycl/queue.hpp | 28 +++- .../sycl/ext/oneapi/experimental/graph.hpp | 129 +++++++++--------- 2 files changed, 92 insertions(+), 65 deletions(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 0601f46d58d33..8e3555fa910d0 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -256,6 +256,9 @@ class __SYCL_EXPORT queue { void submit(sycl::ext::oneapi::experimental::executable_graph&); void begin_capture(sycl::ext::oneapi::experimental::graph*); void end_capture() const; + + template + void capture_parallel_for(range<1> NumWorkItems, KernelType); /// Submits a command group function object to the queue, in order to be @@ -776,7 +779,17 @@ class __SYCL_EXPORT queue { event parallel_for(range<1> NumWorkItems, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); - return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc); + if (!is_capture()) { + std::cout << "in queue, use this parallel_for\n\n\n\n\n"; + return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc); + } + + //else { + // my_graph_ptr->add_node([=](sycl::handler& h){ + // h.template parallel_for(NumWorkItems, + // KernelFunc);}, {}); + // return sycl::event{}; + //} } /// parallel_for version with a kernel represented as a lambda + range that @@ -816,6 +829,7 @@ class __SYCL_EXPORT queue { event parallel_for(range<1> NumWorkItems, event DepEvent, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + std::cout << "in queue, parallel for with events\n\n"; return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc, 
CodeLoc); } @@ -862,6 +876,7 @@ class __SYCL_EXPORT queue { event parallel_for(range<1> NumWorkItems, const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + std::cout << "in queue, parallel for with vector of events\n\n"; return parallel_for_impl(NumWorkItems, DepEvents, KernelFunc, CodeLoc); } @@ -1297,4 +1312,15 @@ template <> struct hash { }; } // namespace std +#include +inline void sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { + g.exec_and_wait(); +} + +inline void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* g) { + my_graph_ptr = g; +} + +inline void sycl::queue::end_capture() const {} + #undef __SYCL_USE_FALLBACK_ASSERT diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 79311ea4bc7ad..f9eb1d277252e 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -55,7 +55,7 @@ struct node_impl { std::function my_body; - void exec(sycl::queue q) { + inline void exec(sycl::queue q) { std::vector __deps; std::vector pred_nodes = my_predecessors; while (!pred_nodes.empty()) { @@ -72,12 +72,12 @@ struct node_impl { my_event = q.submit(wrapper{my_body, __deps}); } - void register_successor(node_ptr n) { + inline void register_successor(node_ptr n) { my_successors.push_back(n); n->register_predecessor(node_ptr(this)); } - void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } + inline void register_predecessor(node_ptr n) { my_predecessors.push_back(n); } sycl::event get_event(void) { return my_event; } @@ -90,7 +90,7 @@ struct node_impl { : is_scheduled(false), is_empty(false), my_graph(g), my_body(cgf) {} // Recursively adding nodes to execution stack: - void topology_sort(std::list &schedule) { + inline void topology_sort(std::list &schedule) { is_scheduled = true; for (auto i : my_successors) { if 
(!i->is_scheduled) @@ -106,7 +106,7 @@ struct graph_impl { graph_ptr parent; - void exec(sycl::queue q) { + inline void exec(sycl::queue q) { if (my_schedule.empty()) { for (auto n : my_roots) { n->topology_sort(my_schedule); @@ -116,19 +116,19 @@ struct graph_impl { n->exec(q); } - void exec_and_wait(sycl::queue q) { + inline void exec_and_wait(sycl::queue q) { exec(q); q.wait(); } - void add_root(node_ptr n) { + inline void add_root(node_ptr n) { my_roots.insert(n); for (auto n : my_schedule) n->is_scheduled = false; my_schedule.clear(); } - void remove_root(node_ptr n) { + inline void remove_root(node_ptr n) { my_roots.erase(n); for (auto n : my_schedule) n->is_scheduled = false; @@ -153,22 +153,22 @@ struct node { node() : my_node(new detail::node_impl()) {} - node(detail::graph_ptr g) : my_graph(g), my_node(new detail::node_impl(g)){}; + node(detail::graph_ptr g) : my_node(new detail::node_impl(g)), my_graph(g){} template node(detail::graph_ptr g, T cgf) - : my_graph(g), my_node(new detail::node_impl(g, cgf)){}; + : my_node(new detail::node_impl(g, cgf)), my_graph(g){} template void update(T cgf) { my_node->is_scheduled = false; my_node->is_empty = false; my_node->my_body = cgf; - }; + } - void register_successor(node n) { my_node->register_successor(n.my_node); } - void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); } + inline void register_successor(node n) { my_node->register_successor(n.my_node); } + inline void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); } - void set_root() { my_graph->add_root(my_node); } + inline void set_root() { my_graph->add_root(my_node); } // TODO: Add query functions: is_root, ... 
}; @@ -181,7 +181,7 @@ class executable_graph { void exec_and_wait(); // { my_queue.wait(); } executable_graph(detail::graph_ptr g, sycl::queue q) - : my_queue(q), my_tag(rand()) { + : my_tag(rand()), my_queue(q) { g->exec(my_queue); } }; @@ -339,7 +339,7 @@ class graph { // TODO: Extend queue to directly submit graph void exec_and_wait(sycl::queue q); - executable_graph instantiate(sycl::queue q) { + inline executable_graph instantiate(sycl::queue q) { return executable_graph{my_graph, q}; }; @@ -354,7 +354,7 @@ class graph { detail::graph_ptr my_graph; }; -void executable_graph::exec_and_wait() { my_queue.wait(); } +inline void executable_graph::exec_and_wait() { my_queue.wait(); } /// Adds a node to the graph, in order to be executed upon graph execution. /// @@ -362,7 +362,7 @@ void executable_graph::exec_and_wait() { my_queue.wait(); } /// \param dep is a vector of graph nodes the to be added node depends on. /// \return a graph node representing the command group operation. template -node graph::add_node(T cgf, const std::vector &dep) { +inline node graph::add_node(T cgf, const std::vector &dep) { node _node(my_graph, cgf); if (!dep.empty()) { for (auto n : dep) @@ -379,7 +379,7 @@ node graph::add_node(T cgf, const std::vector &dep) { /// \param dep is a vector of graph nodes the to be added node depends on. /// \return a graph node representing no operations but potentially node /// dependencies. -node graph::add_node(const std::vector &dep) { +inline node graph::add_node(const std::vector &dep) { node _node(my_graph); if (!dep.empty()) { for (auto n : dep) @@ -395,7 +395,7 @@ node graph::add_node(const std::vector &dep) { /// \param Node is the graph node to be used. This overwrites the node /// parameters. /// \param dep is a vector of graph nodes the to be added node depends on. 
-void graph::add_node(node &Node, const std::vector &dep) { +inline void graph::add_node(node &Node, const std::vector &dep) { Node.my_graph = this->my_graph; Node.my_node->my_graph = this->my_graph; Node.my_node->is_empty = false; @@ -414,7 +414,7 @@ void graph::add_node(node &Node, const std::vector &dep) { /// \param cgf is a function object containing command group. /// \param dep is a vector of graph nodes the to be added node depends on. template -void graph::add_node(node &Node, T cgf, const std::vector &dep) { +inline void graph::add_node(node &Node, T cgf, const std::vector &dep) { Node.my_graph = this->my_graph; Node.my_node->my_graph = this->my_graph; Node.my_node->my_body = cgf; @@ -431,7 +431,7 @@ void graph::add_node(node &Node, T cgf, const std::vector &dep) { /// /// \param Node is a graph node to be updated. /// \param dep is a vector of graph nodes the to be updated node depends on. -void graph::update_node(node &Node, const std::vector &dep) { +inline void graph::update_node(node &Node, const std::vector &dep) { Node.my_graph = this->my_graph; Node.my_node->my_graph = this->my_graph; Node.my_node->is_empty = true; @@ -449,7 +449,7 @@ void graph::update_node(node &Node, const std::vector &dep) { /// \param cgf is a function object containing command group. /// \param dep is a vector of graph nodes the to be updated node depends on. template -void graph::update_node(node &Node, T cgf, const std::vector &dep) { +inline void graph::update_node(node &Node, T cgf, const std::vector &dep) { Node.my_graph = this->my_graph; Node.my_node->my_graph = this->my_graph; Node.my_node->my_body = cgf; @@ -471,7 +471,7 @@ void graph::update_node(node &Node, T cgf, const std::vector &dep) { /// \param dep is a vector of graph nodes the fill depends on. /// \return a graph node representing the fill operation. 
template -node graph::fill(void *Ptr, const T &Pattern, size_t Count, +inline node graph::fill(void *Ptr, const T &Pattern, size_t Count, const std::vector &dep) { return graph::add_node([=](sycl::handler &h) { h.fill(Ptr, Pattern, Count); }, dep); @@ -487,7 +487,7 @@ node graph::fill(void *Ptr, const T &Pattern, size_t Count, /// \param Count is the number of times to fill Pattern into Ptr. /// \param dep is a vector of graph nodes the fill depends on. template -void graph::fill(node &Node, void *Ptr, const T &Pattern, size_t Count, +inline void graph::fill(node &Node, void *Ptr, const T &Pattern, size_t Count, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.fill(Ptr, Pattern, Count); }, dep); @@ -503,7 +503,7 @@ void graph::fill(node &Node, void *Ptr, const T &Pattern, size_t Count, /// \param Src is a USM pointer to the source memory. /// \param dep is a vector of graph nodes the memset depends on. /// \return a graph node representing the memset operation. -node graph::memset(void *Ptr, int Value, size_t Count, +inline node graph::memset(void *Ptr, int Value, size_t Count, const std::vector &dep) { return graph::add_node([=](sycl::handler &h) { h.memset(Ptr, Value, Count); }, dep); @@ -520,7 +520,7 @@ node graph::memset(void *Ptr, int Value, size_t Count, /// \param Dest is a USM pointer to the destination memory. /// \param Src is a USM pointer to the source memory. /// \param dep is a vector of graph nodes the memset depends on. -void graph::memset(node &Node, void *Ptr, int Value, size_t Count, +inline void graph::memset(node &Node, void *Ptr, int Value, size_t Count, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.memset(Ptr, Value, Count); }, dep); @@ -537,7 +537,7 @@ void graph::memset(node &Node, void *Ptr, int Value, size_t Count, /// \param Count is a number of bytes to copy. /// \param dep is a vector of graph nodes the memcpy depends on. 
/// \return a graph node representing the memcpy operation. -node graph::memcpy(void *Dest, const void *Src, size_t Count, +inline node graph::memcpy(void *Dest, const void *Src, size_t Count, const std::vector &dep) { return graph::add_node([=](sycl::handler &h) { h.memcpy(Dest, Src, Count); }, dep); @@ -555,7 +555,7 @@ node graph::memcpy(void *Dest, const void *Src, size_t Count, /// \param Src is a USM pointer to the source memory. /// \param Count is a number of bytes to copy. /// \param dep is a vector of graph nodes the memcpy depends on. -void graph::memcpy(node &Node, void *Dest, const void *Src, size_t Count, +inline void graph::memcpy(node &Node, void *Dest, const void *Src, size_t Count, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.memcpy(Dest, Src, Count); }, dep); @@ -573,7 +573,7 @@ void graph::memcpy(node &Node, void *Dest, const void *Src, size_t Count, /// \param dep is a vector of graph nodes the copy depends on. /// \return a graph node representing the copy operation. template -node graph::copy(const T *Src, T *Dest, size_t Count, +inline node graph::copy(const T *Src, T *Dest, size_t Count, const std::vector &dep) { return graph::add_node( [=](sycl::handler &h) { h.memcpy(Dest, Src, Count * sizeof(T)); }, dep); @@ -592,7 +592,7 @@ node graph::copy(const T *Src, T *Dest, size_t Count, /// \param Count is a number of elements of type T to copy. /// \param dep is a vector of graph nodes the copy depends on. template -void graph::copy(node &Node, const T *Src, T *Dest, size_t Count, +inline void graph::copy(node &Node, const T *Src, T *Dest, size_t Count, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.memcpy(Dest, Src, Count * sizeof(T)); }, @@ -607,7 +607,7 @@ void graph::copy(node &Node, const T *Src, T *Dest, size_t Count, /// \param Advice is a device-defined advice for the specified allocation. /// \param dep is a vector of graph nodes the mem_advise depends on. 
/// \return a graph node representing the mem_advise operation. -node graph::mem_advise(const void *Ptr, size_t Length, int Advice, +inline node graph::mem_advise(const void *Ptr, size_t Length, int Advice, const std::vector &dep) { return graph::add_node( [=](sycl::handler &h) { h.mem_advise(Ptr, Length, Advice); }, dep); @@ -622,7 +622,7 @@ node graph::mem_advise(const void *Ptr, size_t Length, int Advice, /// \param Length is a number of bytes in the allocation. /// \param Advice is a device-defined advice for the specified allocation. /// \param dep is a vector of graph nodes the mem_advise depends on. -void graph::mem_advise(node &Node, const void *Ptr, size_t Length, int Advice, +inline void graph::mem_advise(node &Node, const void *Ptr, size_t Length, int Advice, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.mem_advise(Ptr, Length, Advice); }, dep); @@ -636,7 +636,7 @@ void graph::mem_advise(node &Node, const void *Ptr, size_t Length, int Advice, /// \param Count is a number of bytes to be prefetched. /// \param dep is a vector of graph nodes the prefetch depends on. /// \return a graph node representing the prefetch operation. -node graph::prefetch(const void *Ptr, size_t Count, +inline node graph::prefetch(const void *Ptr, size_t Count, const std::vector &dep) { return graph::add_node([=](sycl::handler &h) { h.prefetch(Ptr, Count); }, dep); @@ -651,7 +651,7 @@ node graph::prefetch(const void *Ptr, size_t Count, /// \param Ptr is a USM pointer to the memory to be prefetched to the device. /// \param Count is a number of bytes to be prefetched. /// \param dep is a vector of graph nodes the prefetch depends on. 
-void graph::prefetch(node &Node, const void *Ptr, size_t Count, +inline void graph::prefetch(node &Node, const void *Ptr, size_t Count, const std::vector &dep) { graph::update_node( Node, [=](sycl::handler &h) { h.prefetch(Ptr, Count); }, dep); @@ -663,7 +663,7 @@ void graph::prefetch(node &Node, const void *Ptr, size_t Count, /// \param dep is a vector of graph nodes the single_task depends on. /// \return a graph node representing the single_task operation. template -node graph::single_task(const KernelType &(KernelFunc), +inline node graph::single_task(const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( [=](sycl::handler &h) { @@ -679,7 +679,7 @@ node graph::single_task(const KernelType &(KernelFunc), /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the single_task depends on. template -void graph::single_task(node &Node, const KernelType &(KernelFunc), +inline void graph::single_task(node &Node, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( Node, @@ -697,8 +697,9 @@ void graph::single_task(node &Node, const KernelType &(KernelFunc), /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. 
template -node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), +inline node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { + std::cout << "in graph, use this parallel_for\n"; return graph::add_node( [=](sycl::handler &h) { h.template parallel_for(NumWorkItems, @@ -716,7 +717,7 @@ node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range<1> NumWorkItems, +inline void graph::parallel_for(node &Node, range<1> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -736,7 +737,7 @@ void graph::parallel_for(node &Node, range<1> NumWorkItems, /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. template -node graph::parallel_for(range<2> NumWorkItems, const KernelType &(KernelFunc), +inline node graph::parallel_for(range<2> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( [=](sycl::handler &h) { @@ -755,7 +756,7 @@ node graph::parallel_for(range<2> NumWorkItems, const KernelType &(KernelFunc), /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range<2> NumWorkItems, +inline void graph::parallel_for(node &Node, range<2> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -775,7 +776,7 @@ void graph::parallel_for(node &Node, range<2> NumWorkItems, /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. 
template -node graph::parallel_for(range<3> NumWorkItems, const KernelType &(KernelFunc), +inline node graph::parallel_for(range<3> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( [=](sycl::handler &h) { @@ -794,7 +795,7 @@ node graph::parallel_for(range<3> NumWorkItems, const KernelType &(KernelFunc), /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range<3> NumWorkItems, +inline void graph::parallel_for(node &Node, range<3> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -814,7 +815,7 @@ void graph::parallel_for(node &Node, range<3> NumWorkItems, /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. template -node graph::parallel_for(range NumWorkItems, +inline node graph::parallel_for(range NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( @@ -834,7 +835,7 @@ node graph::parallel_for(range NumWorkItems, /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range NumWorkItems, +inline void graph::parallel_for(node &Node, range NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -855,7 +856,7 @@ void graph::parallel_for(node &Node, range NumWorkItems, /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. 
template -node graph::parallel_for(range NumWorkItems, id WorkItemOffset, +inline node graph::parallel_for(range NumWorkItems, id WorkItemOffset, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( @@ -876,7 +877,7 @@ node graph::parallel_for(range NumWorkItems, id WorkItemOffset, /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range NumWorkItems, +inline void graph::parallel_for(node &Node, range NumWorkItems, id WorkItemOffset, const KernelType &(KernelFunc), const std::vector &dep) { @@ -898,7 +899,7 @@ void graph::parallel_for(node &Node, range NumWorkItems, /// \param dep is a vector of graph nodes the parallel_for depends on /// \return a graph node representing the parallel_for operation. template -node graph::parallel_for(nd_range ExecutionRange, +inline node graph::parallel_for(nd_range ExecutionRange, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( @@ -919,7 +920,7 @@ node graph::parallel_for(nd_range ExecutionRange, /// \param KernelFunc is the Kernel functor or lambda /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, nd_range ExecutionRange, +inline void graph::parallel_for(node &Node, nd_range ExecutionRange, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -941,7 +942,7 @@ void graph::parallel_for(node &Node, nd_range ExecutionRange, /// \return a graph node representing the parallel_for operation. 
template -node graph::parallel_for(range NumWorkItems, Reduction Redu, +inline node graph::parallel_for(range NumWorkItems, Reduction Redu, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( @@ -963,7 +964,7 @@ node graph::parallel_for(range NumWorkItems, Reduction Redu, /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, range NumWorkItems, Reduction Redu, +inline void graph::parallel_for(node &Node, range NumWorkItems, Reduction Redu, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -986,7 +987,7 @@ void graph::parallel_for(node &Node, range NumWorkItems, Reduction Redu, /// \return a graph node representing the parallel_for operation. template -node graph::parallel_for(nd_range ExecutionRange, Reduction Redu, +inline node graph::parallel_for(nd_range ExecutionRange, Reduction Redu, const KernelType &(KernelFunc), const std::vector &dep) { return graph::add_node( @@ -1009,7 +1010,7 @@ node graph::parallel_for(nd_range ExecutionRange, Reduction Redu, /// \param dep is a vector of graph nodes the parallel_for depends on template -void graph::parallel_for(node &Node, nd_range ExecutionRange, +inline void graph::parallel_for(node &Node, nd_range ExecutionRange, Reduction Redu, const KernelType &(KernelFunc), const std::vector &dep) { graph::update_node( @@ -1021,29 +1022,29 @@ void graph::parallel_for(node &Node, nd_range ExecutionRange, dep); } -void graph::make_edge(node sender, node receiver) { +inline void graph::make_edge(node sender, node receiver) { sender.register_successor(receiver); // register successor my_graph->remove_root(receiver.my_node); // remove receiver from root node // list } -void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); }; +inline void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); } } // namespace experimental } // namespace oneapi } // namespace ext -void 
sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { - g.exec_and_wait(); -} - -void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* ptr) { - my_graph_ptr = ptr; -} - -void sycl::queue::end_capture() const {} +//void sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { +// g.exec_and_wait(); +//} +//void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* ptr) { +// my_graph_ptr = ptr; +//} +// +//void sycl::queue::end_capture() const {} +// } // namespace sycl From b0ff69d38947b09e27c19b93ddfeee22cab6d774 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 22 Jun 2022 07:01:47 -0700 Subject: [PATCH 27/43] added APIs to query #nodes and edges --- .../sycl/ext/oneapi/experimental/graph.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index f9eb1d277252e..abfbe6405c0b5 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -350,6 +350,9 @@ class graph { bool is_subgraph(); + size_t num_nodes() const; + size_t num_edges() const; + private: detail::graph_ptr my_graph; }; @@ -1030,6 +1033,18 @@ inline void graph::make_edge(node sender, node receiver) { inline void graph::exec_and_wait(sycl::queue q) { my_graph->exec_and_wait(q); } +inline size_t graph::num_nodes() const { + return my_graph->my_schedule.size(); +} + +inline size_t graph::num_edges() const { + size_t num_edges = 0; + for (auto& root: my_graph->my_roots) { + num_edges += root->my_successors.size(); + } + return num_edges; +} + } // namespace experimental } // namespace oneapi } // namespace ext From 53829894d3909a05322e4f931b35f4cc70e01c9b Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 22 Jun 2022 08:32:30 -0700 Subject: [PATCH 28/43] Modefied parallel_for and add_node to support q.parallel_for --- sycl/include/CL/sycl/queue.hpp | 
20 ++++++++++- .../sycl/ext/oneapi/experimental/graph.hpp | 34 ++++++++++++++----- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 8e3555fa910d0..993960c9c62b4 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -258,7 +258,7 @@ class __SYCL_EXPORT queue { void end_capture() const; template - void capture_parallel_for(range<1> NumWorkItems, KernelType); + event parallel_for(range<1> NumWorkItems, const KernelType& KernelFunc); /// Submits a command group function object to the queue, in order to be @@ -769,6 +769,7 @@ class __SYCL_EXPORT queue { CodeLoc); } + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -791,6 +792,7 @@ class __SYCL_EXPORT queue { // return sycl::event{}; //} } + */ /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. @@ -1323,4 +1325,20 @@ inline void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* g inline void sycl::queue::end_capture() const {} +template +sycl::event sycl::queue::parallel_for(range<1> NumWorkItems, + const KernelType& KernelFunc) { + if (!is_capture()) { + std::cout << "in queue, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, KernelFunc); + } + + else { + std::cout << "in queue, use capture mode\n\n"; + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for(NumWorkItems, + KernelFunc);}, {}, true); + return sycl::event{}; + } +} #undef __SYCL_USE_FALLBACK_ASSERT diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index abfbe6405c0b5..a2dba97853ba2 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -189,7 +189,7 @@ class executable_graph { class graph { public: // Adds a node - template node 
add_node(T cgf, const std::vector &dep = {}); + template node add_node(T cgf, const std::vector &dep = {}, const bool capture=false); template void add_node(node &Node, T cgf, const std::vector &dep = {}); @@ -343,7 +343,7 @@ class graph { return executable_graph{my_graph, q}; }; - graph() : my_graph(new detail::graph_impl()) {} + graph() : my_graph(new detail::graph_impl()), ptr_prev_node(nullptr) {} // Creates a subgraph (with predecessors) graph(graph &parent, const std::vector &dep = {}) {} @@ -355,6 +355,7 @@ class graph { private: detail::graph_ptr my_graph; + detail::node_ptr ptr_prev_node; }; inline void executable_graph::exec_and_wait() { my_queue.wait(); } @@ -365,13 +366,30 @@ inline void executable_graph::exec_and_wait() { my_queue.wait(); } /// \param dep is a vector of graph nodes the to be added node depends on. /// \return a graph node representing the command group operation. template -inline node graph::add_node(T cgf, const std::vector &dep) { +inline node graph::add_node(T cgf, const std::vector &dep, const bool capture) { node _node(my_graph, cgf); - if (!dep.empty()) { - for (auto n : dep) - this->make_edge(n, _node); - } else { - _node.set_root(); + if (!capture) { + if (!dep.empty()) { + for (auto n : dep) + this->make_edge(n, _node); + } else { + _node.set_root(); + } + } + else { + // first node ever + if (!ptr_prev_node) { + std::cout << "ptr_prev_node = " << ptr_prev_node << '\n'; + _node.set_root(); + ptr_prev_node = _node.my_node; + std::cout << "finish if\n"; + } + else { + std::cout << "ptr_prev_node = " << ptr_prev_node << '\n'; + ptr_prev_node->register_successor(_node.my_node); + ptr_prev_node = _node.my_node; + std::cout << "finish else\n"; + } } return _node; } From 73324ca927a8ba893d0af81ad30a4a841e0edd82 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 29 Jun 2022 09:04:26 -0700 Subject: [PATCH 29/43] q.parallel_for can take one event --- sycl/include/CL/sycl/queue.hpp | 56 ++++++++++++++++--- 
.../sycl/ext/oneapi/experimental/graph.hpp | 3 + 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 993960c9c62b4..75282982cf73d 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -260,6 +260,8 @@ class __SYCL_EXPORT queue { template event parallel_for(range<1> NumWorkItems, const KernelType& KernelFunc); + template + event parallel_for(range<1> NumWorkItems, event DepEvent, const KernelType& KernelFunc); /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. @@ -820,6 +822,7 @@ class __SYCL_EXPORT queue { return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc); } + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -835,6 +838,7 @@ class __SYCL_EXPORT queue { return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc, CodeLoc); } + */ /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. 
@@ -1183,7 +1187,7 @@ class __SYCL_EXPORT queue { int Dims> event parallel_for_impl(range NumWorkItems, event DepEvent, KernelType KernelFunc, - const detail::code_location &CodeLoc) { + const detail::code_location &CodeLoc = detail::code_location::current()) { return submit( [&](handler &CGH) { CGH.depends_on(DepEvent); @@ -1315,19 +1319,23 @@ template <> struct hash { } // namespace std #include -inline void sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { +inline void sycl::queue::submit( + sycl::ext::oneapi::experimental::executable_graph& g) { g.exec_and_wait(); } -inline void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* g) { +inline void sycl::queue::begin_capture( + sycl::ext::oneapi::experimental::graph* g) { my_graph_ptr = g; } inline void sycl::queue::end_capture() const {} template -sycl::event sycl::queue::parallel_for(range<1> NumWorkItems, - const KernelType& KernelFunc) { +sycl::event sycl::queue::parallel_for( + range<1> NumWorkItems, + const KernelType& KernelFunc) { + if (!is_capture()) { std::cout << "in queue, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, KernelFunc); @@ -1336,9 +1344,43 @@ sycl::event sycl::queue::parallel_for(range<1> NumWorkItems, else { std::cout << "in queue, use capture mode\n\n"; my_graph_ptr->add_node([=](sycl::handler& h){ - h.template parallel_for(NumWorkItems, - KernelFunc);}, {}, true); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + {}, + true + ); + return sycl::event{}; } } + + +template +sycl::event sycl::queue::parallel_for( + sycl::range<1> NumWorkItems, + sycl::event DepEvent, + const KernelType& KernelFunc) { + + if (!is_capture()) { + std::cout << "in queue with one event, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + } + + else { + std::cout << "in queue with one event, use capture mode\n\n"; + my_graph_ptr->add_node([=](sycl::handler& h){ + h.depends_on(DepEvent); + h.template 
parallel_for( + NumWorkItems, KernelFunc); + }, + {}, + true + ); + //auto e = parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + + return sycl::event{}; + } +} + #undef __SYCL_USE_FALLBACK_ASSERT diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index a2dba97853ba2..2decd10e39044 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -12,6 +12,7 @@ #include #include +#include __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { @@ -188,6 +189,7 @@ class executable_graph { class graph { public: + // Adds a node template node add_node(T cgf, const std::vector &dep = {}, const bool capture=false); @@ -1063,6 +1065,7 @@ inline size_t graph::num_edges() const { return num_edges; } + } // namespace experimental } // namespace oneapi } // namespace ext From 75ac07de53092ce70129a7edbe820c86603a39a5 Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 29 Jun 2022 09:39:25 -0700 Subject: [PATCH 30/43] q.parallel_for can take events --- sycl/include/CL/sycl/queue.hpp | 36 ++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 75282982cf73d..2c2b006f49275 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -263,6 +263,9 @@ class __SYCL_EXPORT queue { template event parallel_for(range<1> NumWorkItems, event DepEvent, const KernelType& KernelFunc); + template + event parallel_for(range<1> NumWorkItems, const std::vector& DepEvents, const KernelType& KernelFunc); + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// @@ -870,6 +873,7 @@ class __SYCL_EXPORT queue { CodeLoc); } + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. 
/// @@ -886,6 +890,7 @@ class __SYCL_EXPORT queue { return parallel_for_impl(NumWorkItems, DepEvents, KernelFunc, CodeLoc); } + */ /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. @@ -1383,4 +1388,35 @@ sycl::event sycl::queue::parallel_for( } } +template +sycl::event sycl::queue::parallel_for( + sycl::range<1> NumWorkItems, + const std::vector &DepEvents, + const KernelType& KernelFunc) { + + if (!is_capture()) { + std::cout << "in queue with events, not use capture mode\n\n"; + return parallel_for_impl( + NumWorkItems, + DepEvents, + KernelFunc, + detail::code_location::current()); + } + + else { + std::cout << "in queue with events, use capture mode\n\n"; + my_graph_ptr->add_node([=](sycl::handler& h){ + h.depends_on(DepEvents); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + {}, + true + ); + //auto e = parallel_for_impl(NumWorkItems, DepEvents, KernelFunc); + + return sycl::event{}; + } +} + #undef __SYCL_USE_FALLBACK_ASSERT From 1de215ed9d457b2033d6e0f853fecadb82b7587b Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 6 Jul 2022 12:01:00 -0700 Subject: [PATCH 31/43] added unittest.cpp --- sycl/examples/Makefile | 57 +++-- sycl/examples/run.sh | 40 ++-- sycl/examples/unittest.cpp | 441 +++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 27 deletions(-) create mode 100644 sycl/examples/unittest.cpp diff --git a/sycl/examples/Makefile b/sycl/examples/Makefile index 0dda564533baf..fd8861d9889dd 100644 --- a/sycl/examples/Makefile +++ b/sycl/examples/Makefile @@ -7,9 +7,13 @@ CXX = clang++ #CPPFLAGS = -fsycl -std=c++17 -O3 -fsycl-device-code-split=off -fsycl-early-optimizations -fsycl-dead-args-optimization -Wall -Wpedantic -v -CPPFLAGS = -fsycl -std=c++17 -g -fsycl-unnamed-lambda +CPPFLAGS = -fsycl -std=c++17 -g -fsycl-unnamed-lambda -v CPPFLAGSGRAPH = -D SYCL_EXT_ONEAPI_LAZY_QUEUE=1 -CPPINCLUDEPATH = -I${DPCPP_HOME}/llvm/sycl/include -I${DPCPP_HOME}/llvm/include 
+#CPPINCLUDEPATH = -I${DPCPP_HOME}/llvm/sycl/include -I${DPCPP_HOME}/llvm/include +CPPINCLUDEPATH = -I${DPCPP_HOME}/llvm/sycl/include +GTESTINCLUDEPATH = -I${DPCPP_HOME}/../../googletest/googletest/include +GTESTLIB = -lgtest_main -lgtest -lpthread -L${DPCPP_HOME}/../../googletest/build/lib + # Enables AOT compilation. Specify the target device. Disable AOT and use JIT by setting DEVICE to an empty string. CPPFLAGSAOT = -fsycl-targets=spir64_gen -Xs "-device Gen9" # Intel Gen9 GPU # CPPFLAGSAOT = -fsycl-targets=nvptx64-nvidia-cuda # NVIDIA GPU @@ -26,7 +30,7 @@ NVFLAGS = -O3 -lineinfo -arch=sm_75 SRCS = $(wildcard *.cpp) $(wildcard *.hpp) $(wildcard *.cu) -#1DHEAT-OBJECTS = 1Dheat-sycl-buffer 1Dheat-sycl-usm 1Dheat-sycl-buffer-graph 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll +#1DHEAT-OBJECTS = 1Dheat-sycl-buffer 1Dheat-sycl-usm 1Dheat-sycl-buffer-graph 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll 1Dheat-sycl-usm-graph-capture-test #ACCESSORS-OBJECTS = accessors-sycl-buffer accessors-sycl-usm accessors-sycl-buffer-graph accessors-sycl-usm-graph @@ -44,11 +48,14 @@ SRCS = $(wildcard *.cpp) $(wildcard *.hpp) $(wildcard *.cu) #SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture 1Dheat-sycl-usm-graph-unroll accessors-sycl-usm-graph apsp-sycl-usm-graph apsp-sycl-usm-graph-capture apsp-sycl-usm-graph-unroll dotp-sycl-usm-graph dotp-sycl-usm-graph-capture iso2dfd-sycl-usm-graph iso2dfd-sycl-usm-graph-capture iso2dfd-sycl-usm-graph-unroll nn-sycl-usm-graph nn-sycl-usm-graph-unroll #SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture iso2dfd-sycl-usm-graph accessors-sycl-usm-graph apsp-sycl-usm-graph dotp-sycl-usm-graph nn-sycl-usm-graph -SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture +SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph-capture-test +#SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph + +DPL-USM-POINTER-OBJECTS = 
dpl-usm-pointer #SYCL-BUFFER-GRAPH-OBJECTS = 1Dheat-sycl-buffer-graph accessors-sycl-buffer-graph apsp-sycl-buffer-graph dotp-sycl-buffer-graph iso2dfd-sycl-buffer-graph -.PHONY: all 1Dheat accessors apsp dotp iso2dfd nn sycl-graph sycl-usm-graph sycl-buffer-graph run run-1Dheat run-accessors run-apsp run-dotp run-iso2dfd run-nn run-sycl-usm-graph run-sycl-buffer-graph format tidy clean +.PHONY: all 1Dheat accessors apsp dotp iso2dfd nn sycl-graph sycl-usm-graph sycl-buffer-graph run run-1Dheat run-accessors run-apsp run-dotp run-iso2dfd run-nn run-sycl-usm-graph run-sycl-buffer-graph format tidy clean dpl-usm-pointer-graph-capture run-dpl-usm-pointer-graph-capture #all: 1Dheat accessors apsp dotp iso2dfd nn #1Dheat: $(1DHEAT-OBJECTS) @@ -64,6 +71,7 @@ SYCL-USM-GRAPH-OBJECTS = 1Dheat-sycl-usm-graph 1Dheat-sycl-usm-graph-capture # #sycl-graph: sycl-usm-graph sycl-buffer-graph sycl-usm-graph: $(SYCL-USM-GRAPH-OBJECTS) +dpl-usm-pointer-graph-capture: $(DPL-USM-POINTER-OBJECTS) #sycl-buffer-graph: $(SYCL-BUFFER-GRAPH-OBJECTS) # #run: run-1Dheat run-accessors run-apsp run-dotp run-iso2dfd run-nn @@ -80,19 +88,27 @@ sycl-usm-graph: $(SYCL-USM-GRAPH-OBJECTS) # #run-sycl-graph: run-sycl-usm-graph run-sycl-buffer-graph run-sycl-usm-graph: $(addprefix run-,$(SYCL-USM-GRAPH-OBJECTS)) +run-dpl-usm-pointer-graph-capture: $(addprefix run-,$(DPL-USM-POINTER-OBJECTS)) #run-sycl-buffer-graph: $(addprefix run-,$(SYCL-BUFFER-GRAPH-OBJECTS)) # #%-sycl-buffer: %-sycl-buffer.cpp # $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $< -o $@ # -#%-sycl-usm: %-sycl-usm.cpp -# $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $< -o $@ -# +%-sycl-usm: %-sycl-usm.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ + %-sycl-usm-graph: %-sycl-usm-graph.cpp $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ %-sycl-usm-graph-capture: %-sycl-usm-graph-capture.cpp $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ + +%-sycl-usm-graph-capture-test: %-sycl-usm-graph-capture-test.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ + 
+dpl-usm-pointer: dpl-usm-pointer.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $< -o $@ + # #%-sycl-usm-graph-unroll: %-sycl-usm-graph-unroll.cpp # $(CXX) $(CPPFLAGS) $(CPPFLAGSAOT) $(CPPFLAGSGRAPH) $< -o $@ @@ -100,20 +116,25 @@ run-sycl-usm-graph: $(addprefix run-,$(SYCL-USM-GRAPH-OBJECTS)) #run-1Dheat-sycl-buffer: 1Dheat-sycl-buffer # ./$< 10000 1000 # -#run-1Dheat-sycl-usm: 1Dheat-sycl-usm -# ./$< 10000 1000 +run-1Dheat-sycl-usm: 1Dheat-sycl-usm + ./$< 10 3 # #run-1Dheat-sycl-buffer-graph: 1Dheat-sycl-buffer-graph # $(ENVGRAPH) ./$< 10000 1000 # run-1Dheat-sycl-usm-graph: 1Dheat-sycl-usm-graph - $(ENVGRAPH) ./$< 100 10 + $(ENVGRAPH) ./$< 10 1 # $(ENVGRAPH) ./$< 10000 1000 run-1Dheat-sycl-usm-graph-capture: 1Dheat-sycl-usm-graph-capture - $(ENVGRAPH) ./$< 1000 1 + $(ENVGRAPH) ./$< 10 1 +run-1Dheat-sycl-usm-graph-capture-test: 1Dheat-sycl-usm-graph-capture-test + $(ENVGRAPH) ./$< 10 3 +run-dpl-usm-pointer: dpl-usm-pointer + ./$< + #run-1Dheat-sycl-usm-graph-unroll: 1Dheat-sycl-usm-graph-unroll # $(ENVGRAPH) ./$< 10000 1000 # @@ -172,6 +193,16 @@ run-%-sycl-usm-graph: %-sycl-usm-graph #run-%-cuda-graphs-capture: %-cuda-graphs-capture # ./$< +test: unittest + + +#unittest: unittest.cpp +# $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) -I/home/u154255/googletest/googletest/include -lgtest_main -lgtest -L/home/u154255/googletest/build/lib -lpthread $< -o $@ + +unittest: unittest.cpp + $(CXX) $(CPPFLAGS) $(CPPINCLUDEPATH) $(GTESTINCLUDEPATH) $(GTESTLIB) $< -o $@ + + format: clang-format -i $(SRCS) @@ -179,5 +210,5 @@ tidy: clang-tidy -header-filter=.* $(SRCS) -- -I${LLVMBUILDDIR}/include -I${LLVMBUILDDIR}/include/sycl clean: - rm -f $(1DHEAT-OBJECTS) $(ACCESSORS-OBJECTS) $(APSP-OBJECTS) $(DOTP-OBJECTS) $(ISO2DFD-OBJECTS) $(NN-OBJECTS) wavefield_snapshot.bin wavefield_snapshot_cpu.bin *_error_diff.txt + rm -f $(1DHEAT-OBJECTS) $(ACCESSORS-OBJECTS) $(APSP-OBJECTS) $(DOTP-OBJECTS) $(ISO2DFD-OBJECTS) $(NN-OBJECTS) $(DPL-USM-POINTER-OBJECTS) wavefield_snapshot.bin 
wavefield_snapshot_cpu.bin *_error_diff.txt unittest diff --git a/sycl/examples/run.sh b/sycl/examples/run.sh index 814078cddc2a4..48ce967963d3d 100755 --- a/sycl/examples/run.sh +++ b/sycl/examples/run.sh @@ -18,20 +18,32 @@ export LD_LIBRARY_PATH=$LLVMBUILDDIR/lib:$LD_LIBRARY_PATH # clean all the executable and output texts make clean -# compile *-sycl-usm-graph source code -make sycl-usm-graph - -# execute *-sycl-usm-graph executables -echo "" -echo "------ Start running executables ------" -echo "" -make run-sycl-usm-graph -echo "" -echo "------ Finish running executables ------" -echo "" - - - +## compile *-sycl-usm-graph source code +#make sycl-usm-graph +# +## execute *-sycl-usm-graph executables +#echo "" +#echo "------ Start running executables ------" +#echo "" +#make run-sycl-usm-graph +#echo "" +#echo "------ Finish running executables ------" +#echo "" + +##compile *dpl-usm-pointer source code +#make dpl-usm-pointer-graph-capture +# +## execute *-dpl-usm-pointer executables +#echo "" +#echo "------ Start running executables ------" +#echo "" +#make run-dpl-usm-pointer-graph-capture +#echo "" +#echo "------ Finish running executables ------" +#echo "" + +make test +./unittest #export LD_LIBRARY_PATH=$LLVMBUILDDIR/lib:$LD_LIBRARY_PATH #rm -rf ./a.out diff --git a/sycl/examples/unittest.cpp b/sycl/examples/unittest.cpp new file mode 100644 index 0000000000000..5a3aa749f8d40 --- /dev/null +++ b/sycl/examples/unittest.cpp @@ -0,0 +1,441 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include + +#include + +#include + +using namespace oneapi::dpl::execution; +using namespace sycl; + + +const int initial_value = 10; +const int array_size = 6; + + +// one parallel_for in the capture window +TEST(CaptureGraph, one_parallel_for) { + sycl::property_list properties{ + 
sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + auto e = + q.parallel_for( + range<1>{array_size}, + [=](id<1> idx) { + data[idx] = data[idx] + idx; + } + ); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(initial_value + i, data[i]) + << "Execution Error: data[" << i << "] != " << initial_value + i; + } + + free(data, q); +} + + +// one parallel_for and wait in the capture window +TEST(CaptureGraph, one_parallel_for_wait) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + auto e = + q.parallel_for( + range<1>{array_size}, + [=](id<1> idx) { + data[idx] = data[idx] + idx; + } + ); + + e.wait(); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto 
exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(initial_value + i, data[i]) + << "Execution Error: data[" << i << "] != " << initial_value + i; + } + + free(data, q); +} + + +// two parallel_for, two wait and one dependent in the capture window +TEST(CaptureGraph, two_parallel_for_wait_dep) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + auto e1 = + q.parallel_for( + range<1>{array_size}, + [=](id<1> idx) { + data[idx] = data[idx] + idx; + } + ); + + e1.wait(); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + auto e2 = + q.parallel_for( + range<1>{array_size}, + {e1}, + [=](id<1> idx) { + data[idx] = data[idx] * 2; + } + ); + + e2.wait(); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(2*(initial_value + i), data[i]) + << "Execution Error: data[" << i << "] != " << 2*(initial_value + i); + } + + free(data, q); +} + + +// one submit in the capture window +TEST(CaptureGraph, one_submit) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue 
q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + auto e = + q.submit([&](sycl::handler& h){ + h.parallel_for( + range<1>{array_size}, + [=](id<1> idx) { + data[idx] = data[idx] + idx; + } + ); + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(initial_value + i, data[i]) + << "Execution Error: data[" << i << "] != " << initial_value + i; + } + + free(data, q); +} + + + +/* + * The followings capture external libraries + */ + +// one PSTL in the capture window +TEST(CaptureGraph, one_PSTL) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d + 1; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + 
EXPECT_EQ(initial_value + 1, data[i]) + << "Execution Error: data[" << i << "] != " << initial_value+1; + } + + free(data, q); +} + + +// two PSTLs in the capture window +TEST(CaptureGraph, two_PSTLs) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d + 1; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d * 2; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(2*(initial_value + 1), data[i]) + << "Execution Error: data[" << i << "] != " << 2*(initial_value+1); + } + + free(data, q); +} + + +// four PSTLs in the capture window +TEST(CaptureGraph, four_PSTLs) { + sycl::property_list properties{ + sycl::property::queue::in_order(), + sycl::ext::oneapi::property::queue::capture_mode{}}; + + sycl::default_selector device_selector; + + sycl::queue q(device_selector, properties); + + int* data = sycl::malloc_shared(array_size, q); + + for (int i = 0; i < array_size; i++) { + data[i] = initial_value; + } + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + 
<< "Initialize Error: data[" << i << "] != " << initial_value; + } + + sycl::ext::oneapi::experimental::graph g; + + q.begin_capture(&g); + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d + 1; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d * 2; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d / 3; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ + d = d - 10; + }); + + for (int i = 0; i < array_size; ++i) { + EXPECT_EQ(initial_value, data[i]) + << "Error: PSTL executes in the capture window."; + } + + q.end_capture(); + + auto exec_graph = g.instantiate(q); + + q.submit(exec_graph); + + for (int i = 0; i < array_size; i++) { + EXPECT_EQ(2*(initial_value + 1)/3-10, data[i]) + << "Execution Error: data[" << i << "] != " << 2*(initial_value+1)/3-10; + } + + free(data, q); +} From d7439b2df2e4a5c9f5fc4b4418abd6fdae51b71c Mon Sep 17 00:00:00 2001 From: u154255 Date: Wed, 6 Jul 2022 12:42:23 -0700 Subject: [PATCH 32/43] added std::fill in the test --- sycl/examples/unittest.cpp | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/sycl/examples/unittest.cpp b/sycl/examples/unittest.cpp index 5a3aa749f8d40..098f1db6cc54e 100644 --- a/sycl/examples/unittest.cpp +++ b/sycl/examples/unittest.cpp @@ -258,8 +258,8 @@ TEST(CaptureGraph, one_submit) { * The followings capture external libraries */ -// one PSTL in the capture window -TEST(CaptureGraph, one_PSTL) { +// one PSTL - 
for_each in the capture window +TEST(CaptureGraph, one_PSTL_for_each) { sycl::property_list properties{ sycl::property::queue::in_order(), sycl::ext::oneapi::property::queue::capture_mode{}}; @@ -307,7 +307,7 @@ TEST(CaptureGraph, one_PSTL) { } -// two PSTLs in the capture window +// two PSTLs - "for_each and fill" in the capture window TEST(CaptureGraph, two_PSTLs) { sycl::property_list properties{ sycl::property::queue::in_order(), @@ -332,9 +332,7 @@ TEST(CaptureGraph, two_PSTLs) { q.begin_capture(&g); - std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ - d = d + 1; - }); + std::fill(make_device_policy(q), data, data + array_size, initial_value * 2); for (int i = 0; i < array_size; ++i) { EXPECT_EQ(initial_value, data[i]) @@ -342,7 +340,7 @@ TEST(CaptureGraph, two_PSTLs) { } std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ - d = d * 2; + d = d + 1; }); for (int i = 0; i < array_size; ++i) { @@ -357,8 +355,8 @@ TEST(CaptureGraph, two_PSTLs) { q.submit(exec_graph); for (int i = 0; i < array_size; i++) { - EXPECT_EQ(2*(initial_value + 1), data[i]) - << "Execution Error: data[" << i << "] != " << 2*(initial_value+1); + EXPECT_EQ(2*initial_value + 1, data[i]) + << "Execution Error: data[" << i << "] != " << 2*initial_value+1; } free(data, q); @@ -387,12 +385,10 @@ TEST(CaptureGraph, four_PSTLs) { } sycl::ext::oneapi::experimental::graph g; - + q.begin_capture(&g); - std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ - d = d + 1; - }); + std::fill(make_device_policy(q), data, data + array_size, initial_value*2); for (int i = 0; i < array_size; ++i) { EXPECT_EQ(initial_value, data[i]) @@ -400,7 +396,7 @@ TEST(CaptureGraph, four_PSTLs) { } std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ - d = d * 2; + d = d + 1; }); for (int i = 0; i < array_size; ++i) { @@ -408,9 +404,7 @@ TEST(CaptureGraph, four_PSTLs) { << "Error: PSTL executes in the capture window."; } - 
std::for_each(make_device_policy(q), data, data + array_size, [](int& d){ - d = d / 3; - }); + std::fill(make_device_policy(q), data, data + array_size, 10); for (int i = 0; i < array_size; ++i) { EXPECT_EQ(initial_value, data[i]) @@ -433,8 +427,8 @@ TEST(CaptureGraph, four_PSTLs) { q.submit(exec_graph); for (int i = 0; i < array_size; i++) { - EXPECT_EQ(2*(initial_value + 1)/3-10, data[i]) - << "Execution Error: data[" << i << "] != " << 2*(initial_value+1)/3-10; + EXPECT_EQ(0, data[i]) + << "Execution Error: data[" << i << "] != " << "0"; } free(data, q); From 6ecbf93b01029517be6c7c030ae2e2a06efdfcbc Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 07:15:46 -0700 Subject: [PATCH 33/43] added APIs for capture mode use --- sycl/include/CL/sycl/event.hpp | 29 +++++++++++++++++++++++++++++ sycl/source/event.cpp | 29 +++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/event.hpp b/sycl/include/CL/sycl/event.hpp index 98b85f20c4f80..6e5726132cba2 100644 --- a/sycl/include/CL/sycl/event.hpp +++ b/sycl/include/CL/sycl/event.hpp @@ -136,7 +136,36 @@ class __SYCL_EXPORT event { return reinterpret_cast>(getNative()); } + + /// Sets the in_capture variable + /// + /// Sets the in_capture variable to be true + /// if the event is created in the + /// capture window. 
+ void set_in_capture(const bool); + + /// Gets the in_capture variable + /// + /// \return the in_capture variable + bool get_in_capture() const; + + /// Updates the id of an event + void set_id(const size_t); + + /// Gets the eid of an event + /// + /// \return the eid of an event + size_t get_id() const; + private: + /// the variable of an event + /// denoting the status of inside or outside the capture window + /// default variable is false + bool in_capture = false; + + /// the unique id of an event + size_t eid = 0; + event(std::shared_ptr EventImpl); pi_native_handle getNative() const; diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index ef084f219e899..90c799ce46d26 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -22,6 +22,14 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { +bool event::get_in_capture() const { return in_capture; } + +void event::set_in_capture(const bool ic) { in_capture = ic; } + +void event::set_id(const size_t id) { eid = id; } + +size_t event::get_id() const { return eid; } + event::event() : impl(std::make_shared()) {} event::event(cl_event ClEvent, const context &SyclContext) @@ -36,7 +44,14 @@ cl_event event::get() const { return impl->get(); } bool event::is_host() const { return impl->is_host(); } -void event::wait() { impl->wait(impl); } +void event::wait() { + std::cout << "event::wait()\n"; + + // if event::wait() is not called inside the capture window. 
+ if (!get_in_capture()) { + impl->wait(impl); + } +} void event::wait(const std::vector &EventList) { for (auto E : EventList) { @@ -44,9 +59,19 @@ void event::wait(const std::vector &EventList) { } } -void event::wait_and_throw() { impl->wait_and_throw(impl); } +void event::wait_and_throw() { + std::cout << "wait and throw option 5 and in_capture = " + << get_in_capture() << "\n"; + // directly returns if the event.wait_and_throw() + // is called inside the capture window + if (get_in_capture()) { + return; + } + impl->wait_and_throw(impl); +} void event::wait_and_throw(const std::vector &EventList) { + std::cout << "wait and throw option 6\n"; for (auto E : EventList) { E.wait_and_throw(); } From 0659d3cf96206f8f8d8e5878373fab0a84f86ae7 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 07:34:19 -0700 Subject: [PATCH 34/43] If neither lazy exec nor capture mode --- sycl/plugins/level_zero/pi_level_zero.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index e73dbbdadda90..7a5b22143c4ca 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -4948,7 +4948,6 @@ piKernelLaunch(pi_queue Queue) { Queue, CommandList, false /* PreferCopyEngine */, true /* AllowBatching */, Graph /* Shortcut for Graph */)) return Res; - // Execute command list asynchronously, as the event will be used // to track down its completion. 
if (auto Res = Queue->executeCommandList(CommandList, false, true, Graph)) From c3f492fcbe3aa70745043a61b46331c6e9eeaa3b Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 07:49:05 -0700 Subject: [PATCH 35/43] Added in_capture variable in queue_impl::wait() --- sycl/source/detail/queue_impl.cpp | 15 ++++++++++++--- sycl/source/detail/queue_impl.hpp | 4 +++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index ebe3b328a99b0..2e8bd9a1a07b6 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -264,7 +264,7 @@ void queue_impl::instrumentationEpilog(void *TelemetryEvent, std::string &Name, #endif } -void queue_impl::wait(const detail::code_location &CodeLoc) { +void queue_impl::wait(const detail::code_location &CodeLoc, const bool in_capture) { (void)CodeLoc; #ifdef XPTI_ENABLE_INSTRUMENTATION void *TelemetryEvent = nullptr; @@ -275,12 +275,21 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { #endif #if 1 - if(has_property() || - has_property()){ + if(has_property()) { const detail::plugin &Plugin = getPlugin(); if (Plugin.getBackend() == backend::ext_oneapi_level_zero) Plugin.call(getHandleRef()); } + + if(has_property()){ + if(in_capture) { + return; + } + const detail::plugin &Plugin = getPlugin(); + if (Plugin.getBackend() == backend::ext_oneapi_level_zero) + Plugin.call(getHandleRef()); + } + #endif std::vector> WeakEvents; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 9288073ff6774..70e0414d91a0d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -254,13 +254,15 @@ class queue_impl { /// /// Synchronous errors will be reported through SYCL exceptions. 
/// @param Loc is the code location of the submit call (default argument) - void wait(const detail::code_location &Loc = {}); + /// @param in_caputure denotes the status of being in the capture window or not + void wait(const detail::code_location &Loc = {}, const bool in_capture=false); /// \return list of asynchronous exceptions occurred during execution. exception_list getExceptionList() const { return MExceptions; } /// @param Loc is the code location of the submit call (default argument) void wait_and_throw(const detail::code_location &Loc = {}) { + //std::cout << "wait and throw option 4\n"; wait(Loc); throw_asynchronous(); } From 16c99044653f949a55ccadce16616a4e93d5c63a Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 07:53:11 -0700 Subject: [PATCH 36/43] Added two APIs query is_lazy() and is_capture() --- sycl/source/queue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 95db4c67068c9..381a732049d1d 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -156,7 +156,7 @@ event queue::submit_impl_and_postprocess( } void queue::wait_proxy(const detail::code_location &CodeLoc) { - impl->wait(CodeLoc); + impl->wait(CodeLoc, in_capture); } void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { From 14d691d45cda002573f0816fc017cdd5fea8b988 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 08:05:51 -0700 Subject: [PATCH 37/43] Added and Modified APIs --- .../sycl/ext/oneapi/experimental/graph.hpp | 60 ++++++++++++++++--- 1 file changed, 52 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 2decd10e39044..a945c90d0c0ac 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -48,6 +48,9 @@ struct node_impl { bool is_scheduled; bool is_empty; + size_t 
nid = 0; + inline void set_nid(const size_t id) { nid = id; } + graph_ptr my_graph; sycl::event my_event; @@ -57,6 +60,7 @@ struct node_impl { std::function my_body; inline void exec(sycl::queue q) { + //std::cout << "node_imple exec\n"; std::vector __deps; std::vector pred_nodes = my_predecessors; while (!pred_nodes.empty()) { @@ -69,8 +73,10 @@ struct node_impl { else __deps.push_back(curr_node->get_event()); } - if (my_body && !is_empty) + if (my_body && !is_empty) { + //std::cout << "node_impl.exec, q.submit \n"; my_event = q.submit(wrapper{my_body, __deps}); + } } inline void register_successor(node_ptr n) { @@ -108,6 +114,7 @@ struct graph_impl { graph_ptr parent; inline void exec(sycl::queue q) { + //std::cout << "graph_impl, exec\n"; if (my_schedule.empty()) { for (auto n : my_roots) { n->topology_sort(my_schedule); @@ -118,6 +125,7 @@ struct graph_impl { } inline void exec_and_wait(sycl::queue q) { + //std::cout << "graph_impl, exec_and_wait\n"; exec(q); q.wait(); } @@ -167,7 +175,9 @@ struct node { } inline void register_successor(node n) { my_node->register_successor(n.my_node); } - inline void exec(sycl::queue q, sycl::event = sycl::event()) { my_node->exec(q); } + inline void exec(sycl::queue q, sycl::event = sycl::event()) { + //std::cout << "node, exec\n"; + my_node->exec(q); } inline void set_root() { my_graph->add_root(my_node); } @@ -183,6 +193,7 @@ class executable_graph { executable_graph(detail::graph_ptr g, sycl::queue q) : my_tag(rand()), my_queue(q) { + //std::cout << "executable_graph constructor\n"; g->exec(my_queue); } }; @@ -342,6 +353,7 @@ class graph { void exec_and_wait(sycl::queue q); inline executable_graph instantiate(sycl::queue q) { + //std::cout << "executable_graph::instantiate \n"; return executable_graph{my_graph, q}; }; @@ -354,13 +366,23 @@ class graph { size_t num_nodes() const; size_t num_edges() const; + size_t uid = 0; + size_t get_id() const; + void set_id(const size_t); + + std::map id2node; + + detail::node_ptr 
locate_node(const size_t); private: detail::graph_ptr my_graph; detail::node_ptr ptr_prev_node; }; -inline void executable_graph::exec_and_wait() { my_queue.wait(); } +inline void executable_graph::exec_and_wait() { + //std::cout << "executable_graph::exec_and_wait() \n"; + my_queue.wait(); +} /// Adds a node to the graph, in order to be executed upon graph execution. /// @@ -381,16 +403,16 @@ inline node graph::add_node(T cgf, const std::vector &dep, const bool capt else { // first node ever if (!ptr_prev_node) { - std::cout << "ptr_prev_node = " << ptr_prev_node << '\n'; + //std::cout << "first node is null, ptr_prev_node = " << ptr_prev_node << '\n'; _node.set_root(); ptr_prev_node = _node.my_node; - std::cout << "finish if\n"; + //std::cout << "finish if\n"; } else { - std::cout << "ptr_prev_node = " << ptr_prev_node << '\n'; + //std::cout << "first node exists, ptr_prev_node = " << ptr_prev_node << '\n'; ptr_prev_node->register_successor(_node.my_node); ptr_prev_node = _node.my_node; - std::cout << "finish else\n"; + //std::cout << "finish else\n"; } } return _node; @@ -722,7 +744,7 @@ inline void graph::single_task(node &Node, const KernelType &(KernelFunc), template inline node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { - std::cout << "in graph, use this parallel_for\n"; + //std::cout << "in graph, use this parallel_for\n"; return graph::add_node( [=](sycl::handler &h) { h.template parallel_for(NumWorkItems, @@ -1065,6 +1087,28 @@ inline size_t graph::num_edges() const { return num_edges; } +inline size_t graph::get_id() const { + return uid; +} + +inline void graph::set_id(const size_t id) { + uid = id; +} + +inline detail::node_ptr graph::locate_node(const size_t id) { + for (auto root : my_graph->my_roots) { + if (root->nid == id) { + return root; + } + else { + for (auto successor : root->my_successors) { + if (successor->nid == id) { + return successor; + } + } + } + } +} } // namespace 
experimental } // namespace oneapi From da762c0e7d1a4558ce30dbb00c28c77471c12091 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 08:07:51 -0700 Subject: [PATCH 38/43] Added comments --- .../sycl/ext/oneapi/experimental/graph.hpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index a945c90d0c0ac..9c00f9a33a5f1 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -364,12 +364,22 @@ class graph { bool is_subgraph(); + // Returns the number of nodes in the graph size_t num_nodes() const; + + // Returns the number of edges in the graph size_t num_edges() const; + + // The unique id of a node size_t uid = 0; + + // Returns the uid of a node size_t get_id() const; + + // Updates the uid of a node void set_id(const size_t); + // The map between event id and node id std::map id2node; detail::node_ptr locate_node(const size_t); @@ -388,6 +398,7 @@ inline void executable_graph::exec_and_wait() { /// /// \param cgf is a function object containing command group. /// \param dep is a vector of graph nodes the to be added node depends on. +/// \param capture is a variable denoting if being in a capture mode. /// \return a graph node representing the command group operation. 
template inline node graph::add_node(T cgf, const std::vector &dep, const bool capture) { @@ -1115,17 +1126,5 @@ inline detail::node_ptr graph::locate_node(const size_t id) { } // namespace ext -//void sycl::queue::submit(sycl::ext::oneapi::experimental::executable_graph& g) { -// g.exec_and_wait(); -//} - -//void sycl::queue::begin_capture(sycl::ext::oneapi::experimental::graph* ptr) { -// my_graph_ptr = ptr; -//} -// -//void sycl::queue::end_capture() const {} -// - - } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) From 7f1ef1d1cf69cfdb6df8f1f8077fbdca9ef0b4a9 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Wed, 10 Aug 2022 11:11:13 -0700 Subject: [PATCH 39/43] Overloaded parallel_for() --- sycl/include/CL/sycl/queue.hpp | 950 ++++++++++++++++++++++++++++++--- 1 file changed, 890 insertions(+), 60 deletions(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 2c2b006f49275..c14ba5f3cb36e 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -252,53 +252,249 @@ class __SYCL_EXPORT queue { public: + // A pointer to the graph object sycl::ext::oneapi::experimental::graph* my_graph_ptr; - void submit(sycl::ext::oneapi::experimental::executable_graph&); - void begin_capture(sycl::ext::oneapi::experimental::graph*); - void end_capture() const; + + // A boolean denotes the queue of being in capture mode or not + bool in_capture = false; + + // A boolean denotes a node is created in the graph for the first time + bool first_graph_submission = true; + + /// Submits an executable graph for execution + /// \param g is an executable graph + void submit(sycl::ext::oneapi::experimental::executable_graph& g); + + /// Marks the beginning of the capture window + /// \param g is a pointer to a graph object + void begin_capture(sycl::ext::oneapi::experimental::graph* g); + /// Marks the end of the capture window + void end_capture(); + + /// parallel_for version with a kernel represented as a lambda + range 
that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. template event parallel_for(range<1> NumWorkItems, const KernelType& KernelFunc); + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvent is an event that specifies the kernel dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. template event parallel_for(range<1> NumWorkItems, event DepEvent, const KernelType& KernelFunc); + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. template event parallel_for(range<1> NumWorkItems, const std::vector& DepEvents, const KernelType& KernelFunc); + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<2> NumWorkItems, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. 
+ /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvent is an event that specifies the kernel dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<2> NumWorkItems, event DepEvent, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<2> NumWorkItems, const std::vector& DepEvents, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<3> NumWorkItems, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvent is an event that specifies the kernel dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<3> NumWorkItems, event DepEvent, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range that + /// specifies global size only. 
+ /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range<3> NumWorkItems, const std::vector& DepEvents, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param ExecutionRange is a range that specifies the work space of the + /// kernel + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(nd_range ExecutionRange, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param ExecutionRange is a range that specifies the work space of the + /// kernel + /// \param DepEvent is an event that specifies the kernel dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(nd_range ExecutionRange, event DepEvent, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param ExecutionRange is a range that specifies the work space of the + /// kernel + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. 
+ template + event parallel_for(nd_range ExecutionRange, const std::vector& DepEvents, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + nd_range that + /// specifies global, local sizes and offset. + /// + /// \param ExecutionRange is a range that specifies the work space of the + /// kernel + /// \param Redu is a reduction operation + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(nd_range ExecutionRange, Reduction Redu, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range and + /// offset that specify global size and global offset correspondingly. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param WorkItemOffset specifies the offset for each work item id + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range NumWorkItems, id WorkItemOffset, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range and + /// offset that specify global size and global offset correspondingly. + /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param WorkItemOffset specifies the offset for each work item id + /// \param DepEvent is an event that specifies the kernel dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range NumWorkItems, id WorkItemOffset, + event DepEvent, const KernelType& KernelFunc); + + /// parallel_for version with a kernel represented as a lambda + range and + /// offset that specify global size and global offset correspondingly. 
+ /// + /// \param NumWorkItems is a range that specifies the work space of the kernel + /// \param WorkItemOffset specifies the offset for each work item id + /// \param DepEvents is a vector of events that specifies the kernel + /// dependencies + /// \param KernelFunc is the Kernel functor or lambda + /// \return a SYCL event object for the submitted kernel. + template + event parallel_for(range NumWorkItems, id WorkItemOffset, + const std::vector &DepEvents, + const KernelType& KernelFunc); + /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. /// /// \param CGF is a function object containing command group. /// \param CodeLoc is the code location of the submit call (default argument) /// \return a SYCL event object for the submitted command group. - template event submit(T CGF _CODELOCPARAM(&CodeLoc)) { - _CODELOCARG(&CodeLoc); - -#if __SYCL_USE_FALLBACK_ASSERT - if (!is_host()) { - auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert, - event &E) { - if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && - KernelUsesAssert && !device_has(aspect::accelerator)) { - // __devicelib_assert_fail isn't supported by Device-side Runtime - // Linking against fallback impl of __devicelib_assert_fail is - // performed by program manager class - // Fallback assert isn't supported for FPGA - submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr, - CodeLoc); - } - }; - - auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); - return discard_or_return(Event); - } else -#endif // __SYCL_USE_FALLBACK_ASSERT - { - auto Event = submit_impl(CGF, CodeLoc); - return discard_or_return(Event); - } - } + template + event submit( + T CGF, + const detail::code_location &CodeLoc = detail::code_location::current()); + + + /// Submits a command group function object to the queue, in order to be + /// scheduled for execution on the device. 
+ /// + /// \param CGF is a function object containing command group. + /// \param CodeLoc is the code location of the submit call (default argument) + /// \return a SYCL event object for the submitted command group. +// template event submit(T CGF _CODELOCPARAM(&CodeLoc)) { +// _CODELOCARG(&CodeLoc); +// +//#if __SYCL_USE_FALLBACK_ASSERT +// if (!is_host()) { +// auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert, +// event &E) { +// if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && +// KernelUsesAssert && !device_has(aspect::accelerator)) { +// // __devicelib_assert_fail isn't supported by Device-side Runtime +// // Linking against fallback impl of __devicelib_assert_fail is +// // performed by program manager class +// // Fallback assert isn't supported for FPGA +// submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr, +// CodeLoc); +// } +// }; +// +// auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); +// return discard_or_return(Event); +// } else +//#endif // __SYCL_USE_FALLBACK_ASSERT +// { +// auto Event = submit_impl(CGF, CodeLoc); +// return discard_or_return(Event); +// } +// } +// /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. @@ -409,6 +605,7 @@ class __SYCL_EXPORT queue { /// Synchronous errors will be reported through SYCL exceptions. 
/// @param CodeLoc is the code location of the submit call (default argument) void wait(_CODELOCONLYPARAM(&CodeLoc)) { + //std::cout << "queue.hpp, queue.wait() \n"; _CODELOCARG(&CodeLoc); wait_proxy(CodeLoc); @@ -424,7 +621,7 @@ class __SYCL_EXPORT queue { /// @param CodeLoc is the code location of the submit call (default argument) void wait_and_throw(_CODELOCONLYPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); - + //std::cout << "wait and throw option 2\n"; wait_and_throw_proxy(CodeLoc); } @@ -798,7 +995,7 @@ class __SYCL_EXPORT queue { //} } */ - + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -809,9 +1006,11 @@ class __SYCL_EXPORT queue { event parallel_for(range<2> NumWorkItems, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<2>\n"; return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -822,9 +1021,10 @@ class __SYCL_EXPORT queue { event parallel_for(range<3> NumWorkItems, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<3>\n"; return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc); } - + */ /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. @@ -842,7 +1042,7 @@ class __SYCL_EXPORT queue { CodeLoc); } */ - + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. 
/// @@ -854,10 +1054,12 @@ class __SYCL_EXPORT queue { event parallel_for(range<2> NumWorkItems, event DepEvent, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<2>, one dep\n"; return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -869,10 +1071,11 @@ class __SYCL_EXPORT queue { event parallel_for(range<3> NumWorkItems, event DepEvent, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<3>, one dep\n"; return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc, CodeLoc); } - + */ /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. @@ -891,7 +1094,7 @@ class __SYCL_EXPORT queue { CodeLoc); } */ - + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. /// @@ -904,10 +1107,12 @@ class __SYCL_EXPORT queue { event parallel_for(range<2> NumWorkItems, const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<2>, multile deps\n"; return parallel_for_impl(NumWorkItems, DepEvents, KernelFunc, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range that /// specifies global size only. 
/// @@ -920,10 +1125,12 @@ class __SYCL_EXPORT queue { event parallel_for(range<3> NumWorkItems, const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range<3>, multile deps\n"; return parallel_for_impl(NumWorkItems, DepEvents, KernelFunc, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range and /// offset that specify global size and global offset correspondingly. /// @@ -936,6 +1143,7 @@ class __SYCL_EXPORT queue { event parallel_for(range NumWorkItems, id WorkItemOffset, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range, id\n"; return submit( [&](handler &CGH) { CGH.template parallel_for( @@ -943,7 +1151,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range and /// offset that specify global size and global offset correspondingly. /// @@ -958,6 +1167,7 @@ class __SYCL_EXPORT queue { event DepEvent, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range, id, one dep\n"; return submit( [&](handler &CGH) { CGH.depends_on(DepEvent); @@ -966,7 +1176,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + range and /// offset that specify global size and global offset correspondingly. /// @@ -982,6 +1193,7 @@ class __SYCL_EXPORT queue { const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for range, id, multiple deps\n"; return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); @@ -990,7 +1202,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. 
/// @@ -1003,6 +1216,7 @@ class __SYCL_EXPORT queue { event parallel_for(nd_range ExecutionRange, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for nd_range\n"; return submit( [&](handler &CGH) { CGH.template parallel_for(ExecutionRange, @@ -1010,7 +1224,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -1024,6 +1239,7 @@ class __SYCL_EXPORT queue { event parallel_for(nd_range ExecutionRange, event DepEvent, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for nd_range, one dep\n"; return submit( [&](handler &CGH) { CGH.depends_on(DepEvent); @@ -1032,7 +1248,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -1048,6 +1265,7 @@ class __SYCL_EXPORT queue { const std::vector &DepEvents, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for nd_range, multiple deps\n"; return submit( [&](handler &CGH) { CGH.depends_on(DepEvents); @@ -1056,7 +1274,8 @@ class __SYCL_EXPORT queue { }, CodeLoc); } - + */ + /* /// parallel_for version with a kernel represented as a lambda + nd_range that /// specifies global, local sizes and offset. /// @@ -1070,6 +1289,7 @@ class __SYCL_EXPORT queue { event parallel_for(nd_range ExecutionRange, Reduction Redu, _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); + //std::cout << "parallel_for nd_range, redu\n"; return submit( [&](handler &CGH) { CGH.template parallel_for( @@ -1077,6 +1297,7 @@ class __SYCL_EXPORT queue { }, CodeLoc); } + */ // Clean up CODELOC and KERNELFUNC macros. 
#undef _CODELOCPARAM @@ -1327,27 +1548,32 @@ template <> struct hash { inline void sycl::queue::submit( sycl::ext::oneapi::experimental::executable_graph& g) { g.exec_and_wait(); + first_graph_submission = false; } inline void sycl::queue::begin_capture( sycl::ext::oneapi::experimental::graph* g) { my_graph_ptr = g; + in_capture = true; } -inline void sycl::queue::end_capture() const {} +inline void sycl::queue::end_capture() { + in_capture = false; +} template sycl::event sycl::queue::parallel_for( range<1> NumWorkItems, const KernelType& KernelFunc) { - if (!is_capture()) { - std::cout << "in queue, not use capture mode\n\n"; + if (!is_capture() && in_capture) { + //std::cout << "in queue, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, KernelFunc); } else { - std::cout << "in queue, use capture mode\n\n"; + //std::cout << "in queue, use capture mode\n\n"; + auto n = my_graph_ptr->add_node([=](sycl::handler& h){ h.template parallel_for( NumWorkItems, KernelFunc); @@ -1355,8 +1581,17 @@ sycl::event sycl::queue::parallel_for( {}, true ); - - return sycl::event{}; + + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; } } @@ -1368,23 +1603,33 @@ sycl::event sycl::queue::parallel_for( const KernelType& KernelFunc) { if (!is_capture()) { - std::cout << "in queue with one event, not use capture mode\n\n"; + //std::cout << "in queue with one event, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); } else { - std::cout << "in queue with one event, use capture mode\n\n"; + //std::cout << "in queue with one event, use capture mode\n\n"; + auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; + auto n = my_graph_ptr->add_node([=](sycl::handler& h){ - h.depends_on(DepEvent); + //h.depends_on(DepEvent); h.template parallel_for( NumWorkItems, KernelFunc); }, - 
{}, + {DepNode}, true ); //auto e = parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; - return sycl::event{}; + return e; } } @@ -1395,7 +1640,7 @@ sycl::event sycl::queue::parallel_for( const KernelType& KernelFunc) { if (!is_capture()) { - std::cout << "in queue with events, not use capture mode\n\n"; + //std::cout << "in queue with events, not use capture mode\n\n"; return parallel_for_impl( NumWorkItems, DepEvents, @@ -1404,18 +1649,603 @@ sycl::event sycl::queue::parallel_for( } else { + std::vector + DepNodes(DepEvents.size()); + + for (size_t i = 0; i < DepEvents.size(); ++i) { + DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; + } + std::cout << "in queue with events, use capture mode\n\n"; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + //h.depends_on(DepEvents); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + DepNodes, + true + ); + + //auto e = parallel_for_impl(NumWorkItems, DepEvents, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + range<2> NumWorkItems, + const KernelType& KernelFunc) { + + if (!is_capture() && in_capture) { + //std::cout << "in queue, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, KernelFunc); + } + + else { + //std::cout << "in queue, use capture mode\n\n"; + auto n = my_graph_ptr->add_node([=](sycl::handler& h){ - h.depends_on(DepEvents); h.template parallel_for( NumWorkItems, KernelFunc); }, {}, true ); + + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; 
+ my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::range<2> NumWorkItems, + sycl::event DepEvent, + const KernelType& KernelFunc) { + + if (!is_capture()) { + //std::cout << "in queue with one event, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + } + + else { + //std::cout << "in queue with one event, use capture mode\n\n"; + auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + //h.depends_on(DepEvent); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + {DepNode}, + true + ); + //auto e = parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::range<2> NumWorkItems, + const std::vector &DepEvents, + const KernelType& KernelFunc) { + + if (!is_capture()) { + //std::cout << "in queue with events, not use capture mode\n\n"; + return parallel_for_impl( + NumWorkItems, + DepEvents, + KernelFunc, + detail::code_location::current()); + } + + else { + std::vector + DepNodes(DepEvents.size()); + + for (size_t i = 0; i < DepEvents.size(); ++i) { + DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; + } + + std::cout << "in queue with events, use capture mode\n\n"; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + //h.depends_on(DepEvents); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + DepNodes, + true + ); + //auto e = parallel_for_impl(NumWorkItems, DepEvents, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + 
my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + range<3> NumWorkItems, + const KernelType& KernelFunc) { + + if (!is_capture() && in_capture) { + //std::cout << "in queue, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, KernelFunc); + } + + else { + //std::cout << "in queue, use capture mode\n\n"; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + {}, + true + ); - return sycl::event{}; + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::range<3> NumWorkItems, + sycl::event DepEvent, + const KernelType& KernelFunc) { + + if (!is_capture()) { + //std::cout << "in queue with one event, not use capture mode\n\n"; + return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + } + + else { + //std::cout << "in queue with one event, use capture mode\n\n"; + auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + //h.depends_on(DepEvent); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + {DepNode}, + true + ); + //auto e = parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::range<3> NumWorkItems, + const std::vector &DepEvents, + const KernelType& KernelFunc) { + + if (!is_capture()) { + //std::cout << "in queue with events, not use capture mode\n\n"; + return parallel_for_impl( + NumWorkItems, + DepEvents, + KernelFunc, + 
detail::code_location::current()); + } + + else { + std::vector + DepNodes(DepEvents.size()); + + for (size_t i = 0; i < DepEvents.size(); ++i) { + DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; + } + + std::cout << "in queue with events, use capture mode\n\n"; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + //h.depends_on(DepEvents); + h.template parallel_for( + NumWorkItems, KernelFunc); + }, + DepNodes, + true + ); + + //auto e = parallel_for_impl(NumWorkItems, DepEvents, KernelFunc); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::nd_range ExecutionRange, + const KernelType& KernelFunc) { + + //std::cout << "parallel_for nd_range\n"; + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.template parallel_for(ExecutionRange, + KernelFunc); + }); + } + else { + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + ExecutionRange, KernelFunc); + }, + {}, + true + ); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::nd_range ExecutionRange, + sycl::event DepEvent, + const KernelType& KernelFunc) { + + //std::cout << "parallel_for nd_range\n"; + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(ExecutionRange, + KernelFunc); + }); + } + else { + //std::cout << "in queue with one event, use capture mode\n\n"; + auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template 
parallel_for( + ExecutionRange, KernelFunc); + }, + {DepNode}, + true + ); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::nd_range ExecutionRange, + const std::vector &DepEvents, + const KernelType& KernelFunc){ + + //std::cout << "parallel_for nd_range\n"; + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(ExecutionRange, + KernelFunc); + }); + } + else { + //std::cout << "in queue with one event, use capture mode\n\n"; + std::vector + DepNodes(DepEvents.size()); + + for (size_t i = 0; i < DepEvents.size(); ++i) { + DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; + } + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + ExecutionRange, KernelFunc); + }, + DepNodes, + true + ); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + + } +} + +template +sycl::event sycl::queue::parallel_for( + sycl::nd_range ExecutionRange, + Reduction Redu, + const KernelType& KernelFunc) { + + //std::cout << "parallel_for nd_range, redu\n"; + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.template parallel_for( + ExecutionRange, Redu, KernelFunc); + } + ); + } + else { + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + ExecutionRange, Redu, KernelFunc); + }, + {}, + true + ); + + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + + } +} + 
+template +sycl::event sycl::queue::parallel_for( + range NumWorkItems, id WorkItemOffset, + const KernelType& KernelFunc) { + + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + } + ); + } + else { + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + {}, + true + ); + + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + range NumWorkItems, id WorkItemOffset, + event DepEvent, const KernelType& KernelFunc) { + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + } + ); + } + else { + + auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; + auto n = + my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + {DepNode}, + true + ); + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::parallel_for( + range NumWorkItems, id WorkItemOffset, + const std::vector &DepEvents, + const KernelType& KernelFunc) { + + if (!is_capture() && in_capture) { + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + } + ); + } + else { + std::vector + DepNodes(DepEvents.size()); + + for (size_t i = 0; i < DepEvents.size(); ++i) { + DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; + } + + auto n = + 
my_graph_ptr->add_node([=](sycl::handler& h){ + h.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + DepNodes, + true + ); + + size_t uid = my_graph_ptr->get_id(); + auto e = sycl::event{}; + e.set_in_capture(true); + e.set_id(uid); + + n.my_node->nid = uid; + my_graph_ptr->set_id(++uid); + my_graph_ptr->id2node[uid] = n; + + return e; + } +} + +template +sycl::event sycl::queue::submit( + T CGF, + const detail::code_location &CodeLoc){ + +#if __SYCL_USE_FALLBACK_ASSERT + if (!is_host()) { + auto PostProcess = [this, &CodeLoc](bool IsKernel, bool KernelUsesAssert, + event &E) { + if (IsKernel && !device_has(aspect::ext_oneapi_native_assert) && + KernelUsesAssert && !device_has(aspect::accelerator)) { + // __devicelib_assert_fail isn't supported by Device-side Runtime + // Linking against fallback impl of __devicelib_assert_fail is + // performed by program manager class + // Fallback assert isn't supported for FPGA + submitAssertCapture(*this, E, /* SecondaryQueue = */ nullptr, + CodeLoc); + } + }; + + auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess); + return discard_or_return(Event); + } else +#endif // __SYCL_USE_FALLBACK_ASSERT + { + if (!is_capture()) { + std::cout << "in queue submit, not use capture mode\n\n"; + auto Event = submit_impl(CGF, CodeLoc); + return discard_or_return(Event); + } + else { + //std::cout << "in queue submit, use capture mode and " + // << "in_capture = " << in_capture << "\n\n"; + + //auto Event = submit_impl(CGF, CodeLoc); + if (in_capture) { + my_graph_ptr->add_node(CGF, {}, true); + //auto Event = submit_impl(CGF, CodeLoc); + auto Event = sycl::event{}; + Event.set_in_capture(true); + return Event; + } + else { + auto Event = submit_impl(CGF, CodeLoc); + return discard_or_return(Event); + } + //return discard_or_return(Event); + } } } From 4530a2cbca7917e0ea5c87c93b64602ef97df2f7 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Thu, 11 Aug 2022 18:44:27 -0700 Subject: [PATCH 40/43] 
Deleted debugging msg --- sycl/source/event.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index 90c799ce46d26..3108cb75b88a8 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -45,8 +45,6 @@ cl_event event::get() const { return impl->get(); } bool event::is_host() const { return impl->is_host(); } void event::wait() { - std::cout << "event::wait()\n"; - // if event::wait() is not called inside the capture window. if (!get_in_capture()) { impl->wait(impl); @@ -60,8 +58,6 @@ void event::wait(const std::vector &EventList) { } void event::wait_and_throw() { - std::cout << "wait and throw option 5 and in_capture = " - << get_in_capture() << "\n"; // directly returns if the event.wait_and_throw() // is called inside the capture window if (get_in_capture()) { @@ -71,7 +67,6 @@ void event::wait_and_throw() { } void event::wait_and_throw(const std::vector &EventList) { - std::cout << "wait and throw option 6\n"; for (auto E : EventList) { E.wait_and_throw(); } From a056e08e9a4a81908e36663118b206e7b298ca5f Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Thu, 11 Aug 2022 18:49:12 -0700 Subject: [PATCH 41/43] Deleted debugging msg --- sycl/source/detail/queue_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 70e0414d91a0d..762a1e151be9b 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -262,7 +262,6 @@ class queue_impl { /// @param Loc is the code location of the submit call (default argument) void wait_and_throw(const detail::code_location &Loc = {}) { - //std::cout << "wait and throw option 4\n"; wait(Loc); throw_asynchronous(); } From 95bb4a9cf229ce8784b54b3b3292267d23c54865 Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Thu, 11 Aug 2022 18:53:55 -0700 Subject: [PATCH 42/43] Deleted debugging msg --- sycl/include/sycl/ext/oneapi/experimental/graph.hpp | 13 
------------- 1 file changed, 13 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 9c00f9a33a5f1..47bb97ded0443 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -60,7 +60,6 @@ struct node_impl { std::function my_body; inline void exec(sycl::queue q) { - //std::cout << "node_imple exec\n"; std::vector __deps; std::vector pred_nodes = my_predecessors; while (!pred_nodes.empty()) { @@ -74,7 +73,6 @@ struct node_impl { __deps.push_back(curr_node->get_event()); } if (my_body && !is_empty) { - //std::cout << "node_impl.exec, q.submit \n"; my_event = q.submit(wrapper{my_body, __deps}); } } @@ -114,7 +112,6 @@ struct graph_impl { graph_ptr parent; inline void exec(sycl::queue q) { - //std::cout << "graph_impl, exec\n"; if (my_schedule.empty()) { for (auto n : my_roots) { n->topology_sort(my_schedule); @@ -125,7 +122,6 @@ struct graph_impl { } inline void exec_and_wait(sycl::queue q) { - //std::cout << "graph_impl, exec_and_wait\n"; exec(q); q.wait(); } @@ -176,7 +172,6 @@ struct node { inline void register_successor(node n) { my_node->register_successor(n.my_node); } inline void exec(sycl::queue q, sycl::event = sycl::event()) { - //std::cout << "node, exec\n"; my_node->exec(q); } inline void set_root() { my_graph->add_root(my_node); } @@ -193,7 +188,6 @@ class executable_graph { executable_graph(detail::graph_ptr g, sycl::queue q) : my_tag(rand()), my_queue(q) { - //std::cout << "executable_graph constructor\n"; g->exec(my_queue); } }; @@ -353,7 +347,6 @@ class graph { void exec_and_wait(sycl::queue q); inline executable_graph instantiate(sycl::queue q) { - //std::cout << "executable_graph::instantiate \n"; return executable_graph{my_graph, q}; }; @@ -390,7 +383,6 @@ class graph { }; inline void executable_graph::exec_and_wait() { - //std::cout << "executable_graph::exec_and_wait() \n"; my_queue.wait(); } @@ 
-414,16 +406,12 @@ inline node graph::add_node(T cgf, const std::vector &dep, const bool capt else { // first node ever if (!ptr_prev_node) { - //std::cout << "first node is null, ptr_prev_node = " << ptr_prev_node << '\n'; _node.set_root(); ptr_prev_node = _node.my_node; - //std::cout << "finish if\n"; } else { - //std::cout << "first node exists, ptr_prev_node = " << ptr_prev_node << '\n'; ptr_prev_node->register_successor(_node.my_node); ptr_prev_node = _node.my_node; - //std::cout << "finish else\n"; } } return _node; @@ -755,7 +743,6 @@ inline void graph::single_task(node &Node, const KernelType &(KernelFunc), template inline node graph::parallel_for(range<1> NumWorkItems, const KernelType &(KernelFunc), const std::vector &dep) { - //std::cout << "in graph, use this parallel_for\n"; return graph::add_node( [=](sycl::handler &h) { h.template parallel_for(NumWorkItems, From 24429911f570d561af9e2274f74657eb6698eb8d Mon Sep 17 00:00:00 2001 From: cheng-hsiang-chiu Date: Thu, 11 Aug 2022 19:06:13 -0700 Subject: [PATCH 43/43] Added comments and Deleted debugging msg --- sycl/include/CL/sycl/queue.hpp | 48 +++++++++------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index c14ba5f3cb36e..b5d3f4df23ac7 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -605,7 +605,6 @@ class __SYCL_EXPORT queue { /// Synchronous errors will be reported through SYCL exceptions. 
/// @param CodeLoc is the code location of the submit call (default argument) void wait(_CODELOCONLYPARAM(&CodeLoc)) { - //std::cout << "queue.hpp, queue.wait() \n"; _CODELOCARG(&CodeLoc); wait_proxy(CodeLoc); @@ -621,7 +620,6 @@ class __SYCL_EXPORT queue { /// @param CodeLoc is the code location of the submit call (default argument) void wait_and_throw(_CODELOCONLYPARAM(&CodeLoc)) { _CODELOCARG(&CodeLoc); - //std::cout << "wait and throw option 2\n"; wait_and_throw_proxy(CodeLoc); } @@ -1311,7 +1309,14 @@ class __SYCL_EXPORT queue { /// Equivalent to has_property() bool is_in_order() const; + /// Returns a boolean if queue is in lazy execution + /// + /// \return a boolean if queue is in lazy execution bool is_lazy() const; + + /// Returns a boolean if queue is in capture mode + /// + /// \return a boolean if queue is in capture mode bool is_capture() const; @@ -1567,12 +1572,10 @@ sycl::event sycl::queue::parallel_for( const KernelType& KernelFunc) { if (!is_capture() && in_capture) { - //std::cout << "in queue, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, KernelFunc); } else { - //std::cout << "in queue, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ h.template parallel_for( @@ -1602,13 +1605,11 @@ sycl::event sycl::queue::parallel_for( sycl::event DepEvent, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with one event, not use capture mode\n\n"; + if (!is_capture() && in_capture) { return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); } else { - //std::cout << "in queue with one event, use capture mode\n\n"; auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ @@ -1639,8 +1640,7 @@ sycl::event sycl::queue::parallel_for( const std::vector &DepEvents, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with events, not use capture mode\n\n"; + if (!is_capture() && in_capture) { 
return parallel_for_impl( NumWorkItems, DepEvents, @@ -1656,7 +1656,6 @@ sycl::event sycl::queue::parallel_for( DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; } - std::cout << "in queue with events, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ //h.depends_on(DepEvents); @@ -1687,12 +1686,10 @@ sycl::event sycl::queue::parallel_for( const KernelType& KernelFunc) { if (!is_capture() && in_capture) { - //std::cout << "in queue, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, KernelFunc); } else { - //std::cout << "in queue, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ h.template parallel_for( @@ -1721,13 +1718,11 @@ sycl::event sycl::queue::parallel_for( sycl::event DepEvent, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with one event, not use capture mode\n\n"; + if (!is_capture() && in_capture) { return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); } else { - //std::cout << "in queue with one event, use capture mode\n\n"; auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ @@ -1758,8 +1753,7 @@ sycl::event sycl::queue::parallel_for( const std::vector &DepEvents, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with events, not use capture mode\n\n"; + if (!is_capture() && in_capture) { return parallel_for_impl( NumWorkItems, DepEvents, @@ -1775,7 +1769,6 @@ sycl::event sycl::queue::parallel_for( DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; } - std::cout << "in queue with events, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ //h.depends_on(DepEvents); @@ -1806,12 +1799,10 @@ sycl::event sycl::queue::parallel_for( const KernelType& KernelFunc) { if (!is_capture() && in_capture) { - //std::cout << "in queue, not use capture mode\n\n"; return parallel_for_impl(NumWorkItems, KernelFunc); } 
else { - //std::cout << "in queue, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ h.template parallel_for( @@ -1840,13 +1831,11 @@ sycl::event sycl::queue::parallel_for( sycl::event DepEvent, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with one event, not use capture mode\n\n"; + if (!is_capture() && in_capture) { return parallel_for_impl(NumWorkItems, DepEvent, KernelFunc); } else { - //std::cout << "in queue with one event, use capture mode\n\n"; auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ @@ -1877,8 +1866,7 @@ sycl::event sycl::queue::parallel_for( const std::vector &DepEvents, const KernelType& KernelFunc) { - if (!is_capture()) { - //std::cout << "in queue with events, not use capture mode\n\n"; + if (!is_capture() && in_capture) { return parallel_for_impl( NumWorkItems, DepEvents, @@ -1894,7 +1882,6 @@ sycl::event sycl::queue::parallel_for( DepNodes[i] = my_graph_ptr->id2node[DepEvents[i].get_id()]; } - std::cout << "in queue with events, use capture mode\n\n"; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ //h.depends_on(DepEvents); @@ -1924,7 +1911,6 @@ sycl::event sycl::queue::parallel_for( sycl::nd_range ExecutionRange, const KernelType& KernelFunc) { - //std::cout << "parallel_for nd_range\n"; if (!is_capture() && in_capture) { return submit( [&](handler &CGH) { @@ -1961,7 +1947,6 @@ sycl::event sycl::queue::parallel_for( sycl::event DepEvent, const KernelType& KernelFunc) { - //std::cout << "parallel_for nd_range\n"; if (!is_capture() && in_capture) { return submit( [&](handler &CGH) { @@ -1971,7 +1956,6 @@ sycl::event sycl::queue::parallel_for( }); } else { - //std::cout << "in queue with one event, use capture mode\n\n"; auto DepNode = my_graph_ptr->id2node[DepEvent.get_id()]; auto n = my_graph_ptr->add_node([=](sycl::handler& h){ @@ -2001,7 +1985,6 @@ sycl::event sycl::queue::parallel_for( const std::vector 
&DepEvents, const KernelType& KernelFunc){ - //std::cout << "parallel_for nd_range\n"; if (!is_capture() && in_capture) { return submit( [&](handler &CGH) { @@ -2011,7 +1994,6 @@ sycl::event sycl::queue::parallel_for( }); } else { - //std::cout << "in queue with one event, use capture mode\n\n"; std::vector DepNodes(DepEvents.size()); @@ -2047,7 +2029,6 @@ sycl::event sycl::queue::parallel_for( Reduction Redu, const KernelType& KernelFunc) { - //std::cout << "parallel_for nd_range, redu\n"; if (!is_capture() && in_capture) { return submit( [&](handler &CGH) { @@ -2224,13 +2205,10 @@ sycl::event sycl::queue::submit( #endif // __SYCL_USE_FALLBACK_ASSERT { if (!is_capture()) { - std::cout << "in queue submit, not use capture mode\n\n"; auto Event = submit_impl(CGF, CodeLoc); return discard_or_return(Event); } else { - //std::cout << "in queue submit, use capture mode and " - // << "in_capture = " << in_capture << "\n\n"; //auto Event = submit_impl(CGF, CodeLoc); if (in_capture) {