diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index 23008e75b80fb..6ecf34d005bd3 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -356,11 +356,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       }
     };
 
-    auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
-    return discard_or_return(Event);
+    return submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
 #else
-    auto Event = submit_impl(CGF, CodeLoc);
-    return discard_or_return(Event);
+    return submit_impl(CGF, CodeLoc);
 #endif // __SYCL_USE_FALLBACK_ASSERT
   }
 
@@ -395,12 +393,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       }
     };
 
-    auto Event =
-        submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, PostProcess);
-    return discard_or_return(Event);
+    return submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc,
+                                       PostProcess);
 #else
-    auto Event = submit_impl(CGF, SecondaryQueue, CodeLoc);
-    return discard_or_return(Event);
+    return submit_impl(CGF, SecondaryQueue, CodeLoc);
 #endif // __SYCL_USE_FALLBACK_ASSERT
   }
 
@@ -2814,6 +2810,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
 
   /// Checks if the event needs to be discarded and if so, discards it and
   /// returns a discarded event. Otherwise, it returns input event.
+  /// TODO: move to impl class in the next ABI Breaking window
   event discard_or_return(const event &Event);
 
   // Function to postprocess submitted command
diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp
index 4103653d236a7..222b06127207d 100644
--- a/sycl/source/detail/helpers.cpp
+++ b/sycl/source/detail/helpers.cpp
@@ -21,6 +21,8 @@ namespace sycl {
 inline namespace _V1 {
 using ContextImplPtr = std::shared_ptr<sycl::detail::context_impl>;
 namespace detail {
+// TODO: remove from public header files and implementation during the next ABI
+// Breaking window. Not used any more.
 std::vector<sycl::detail::pi::PiEvent>
 getOrWaitEvents(std::vector<sycl::event> DepEvents, ContextImplPtr Context) {
   std::vector<sycl::detail::pi::PiEvent> Events;
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index b8785e12f0535..6fdfdd7a4d11c 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -26,9 +26,19 @@
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
-
 std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 0;
 
+static std::vector<sycl::detail::pi::PiEvent>
+getPIEvents(const std::vector<sycl::event> &DepEvents) {
+  std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
+  for (const sycl::event &Event : DepEvents) {
+    const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event);
+    if (EventImpl->getHandleRef() != nullptr)
+      RetPiEvents.push_back(EventImpl->getHandleRef());
+  }
+  return RetPiEvents;
+}
+
 template <>
 uint32_t queue_impl::get_info<info::queue::reference_count>() const {
   sycl::detail::pi::PiResult result = PI_SUCCESS;
@@ -63,16 +73,25 @@ static event createDiscardedEvent() {
 
 const std::vector<event> &
 queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
-                                    std::vector<event> &MutableVec) {
-  if (isInOrder()) {
-    std::optional<event> ExternalEvent = popExternalEvent();
-    if (ExternalEvent) {
-      MutableVec = DepEvents;
-      MutableVec.push_back(*ExternalEvent);
-      return MutableVec;
-    }
-  }
-  return DepEvents;
+                                    std::vector<event> &MutableVec,
+                                    std::unique_lock<std::mutex> &QueueLock) {
+  if (!isInOrder())
+    return DepEvents;
+
+  QueueLock.lock();
+  EventImplPtr ExtraEvent =
+      MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr;
+  std::optional<event> ExternalEvent = popExternalEvent();
+
+  if (!ExternalEvent && !ExtraEvent)
+    return DepEvents;
+
+  MutableVec = DepEvents;
+  if (ExternalEvent)
+    MutableVec.push_back(*ExternalEvent);
+  if (ExtraEvent)
+    MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
+  return MutableVec;
 }
 
 event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
@@ -106,7 +125,7 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
   }
 
   return submitMemOpHelper(
-      Self, DepEvents,
+      Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); },
       [](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self,
       Count, Value);
 }
@@ -152,21 +171,17 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
 #endif
   // If we have a command graph set we need to capture the copy through normal
   // queue submission rather than execute the copy directly.
-  if (MGraph.lock()) {
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvents);
-          CGH.memcpy(Dest, Src, Count);
-        },
-        Self, {});
-  }
+  auto HandlerFunc = [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); };
+  if (MGraph.lock())
+    return submitWithHandler(Self, DepEvents, HandlerFunc);
+
   if ((!Src || !Dest) && Count != 0) {
     report(CodeLoc);
     throw runtime_error("NULL pointer argument in memory copy operation.",
                         PI_ERROR_INVALID_VALUE);
   }
   return submitMemOpHelper(
-      Self, DepEvents,
+      Self, DepEvents, HandlerFunc,
       [](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self,
       Count, Dest);
 }
@@ -177,17 +192,12 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
                              const std::vector<event> &DepEvents) {
   // If we have a command graph set we need to capture the advise through normal
   // queue submission.
-  if (MGraph.lock()) {
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvents);
-          CGH.mem_advise(Ptr, Length, Advice);
-        },
-        Self, {});
-  }
+  auto HandlerFunc = [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); };
+  if (MGraph.lock())
+    return submitWithHandler(Self, DepEvents, HandlerFunc);
 
   return submitMemOpHelper(
-      Self, DepEvents,
+      Self, DepEvents, HandlerFunc,
       [](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr,
       Self, Length, Advice);
 }
@@ -198,6 +208,10 @@ event queue_impl::memcpyToDeviceGlobal(
     const std::vector<event> &DepEvents) {
   return submitMemOpHelper(
       Self, DepEvents,
+      [&](handler &CGH) {
+        CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope,
+                                 NumBytes, Offset);
+      },
       [](const auto &...Args) {
         MemoryManager::copy_to_device_global(Args...);
       },
@@ -210,15 +224,25 @@ event queue_impl::memcpyFromDeviceGlobal(
     size_t Offset, const std::vector<event> &DepEvents) {
   return submitMemOpHelper(
       Self, DepEvents,
+      [&](handler &CGH) {
+        CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope,
+                                   NumBytes, Offset);
+      },
       [](const auto &...Args) {
         MemoryManager::copy_from_device_global(Args...);
       },
       DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
 }
 
-event queue_impl::getLastEvent() const {
-  std::lock_guard<std::mutex> Lock{MLastEventMtx};
-  return MDiscardEvents ? createDiscardedEvent() : MLastEvent;
+event queue_impl::getLastEvent() {
+  std::lock_guard<std::mutex> Lock{MMutex};
+  if (MDiscardEvents)
+    return createDiscardedEvent();
+  if (!MGraph.expired() && MGraphLastEventPtr)
+    return detail::createSyclObjFromImpl<event>(MGraphLastEventPtr);
+  if (!MLastEventPtr)
+    MLastEventPtr = std::make_shared<event_impl>(std::nullopt);
+  return detail::createSyclObjFromImpl<event>(MLastEventPtr);
 }
 
 void queue_impl::addEvent(const event &Event) {
@@ -273,55 +297,90 @@ void queue_impl::addSharedEvent(const event &Event) {
   MEventsShared.push_back(Event);
 }
 
-template <typename MemOpFuncT, typename... MemOpArgTs>
+static bool
+areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
+                                ContextImplPtr Context) {
+  auto CheckEvent = [&Context](const sycl::event &Event) {
+    const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
+    // Events that don't have an initialized context are throwaway events that
+    // don't represent actual dependencies. Calling getContextImpl() would set
+    // their context, which we wish to avoid as it is expensive.
+    if (!SyclEventImplPtr->isContextInitialized() &&
+        !SyclEventImplPtr->is_host()) {
+      return true;
+    }
+    if (SyclEventImplPtr->is_host()) {
+      return SyclEventImplPtr->isCompleted();
+    }
+    // Cross-context dependencies can't be passed to the backend directly.
+    if (SyclEventImplPtr->getContextImpl() != Context)
+      return false;
+
+    // A nullptr here means that the commmand does not produce a PI event or it
+    // hasn't been enqueued yet.
+    return SyclEventImplPtr->getHandleRef() != nullptr;
+  };
+
+  return std::all_of(DepEvents.begin(), DepEvents.end(),
+                     [&Context, &CheckEvent](const sycl::event &Event) {
+                       return CheckEvent(Event);
+                     });
+}
+
+template <typename HandlerFuncT>
+event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
+                                    const std::vector<event> &DepEvents,
+                                    HandlerFuncT HandlerFunc) {
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvents);
+        HandlerFunc(CGH);
+      },
+      Self, {});
+}
+
+template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>
 event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
                                     const std::vector<event> &DepEvents,
+                                    HandlerFuncT HandlerFunc,
                                     MemOpFuncT MemOpFunc,
                                     MemOpArgTs... MemOpArgs) {
-  if (MHasDiscardEventsSupport) {
-    MemOpFunc(MemOpArgs..., getOrWaitEvents(DepEvents, MContext),
-              /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
-    return createDiscardedEvent();
-  }
-
-  event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
+  // We need to submit command and update the last event under same lock if we
+  // have in-order queue.
   {
-    // We need to submit command and update the last event under same lock if we
-    // have in-order queue.
-    auto ScopeLock = isInOrder() ? std::unique_lock<std::mutex>(MLastEventMtx)
-                                 : std::unique_lock<std::mutex>();
-    // If the last submitted command in the in-order queue is host_task then
-    // wait for it before submitting usm command.
-    if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask)
-      MLastEvent.wait();
+    std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);
 
     std::vector<event> MutableDepEvents;
     const std::vector<event> &ExpandedDepEvents =
-        getExtendDependencyList(DepEvents, MutableDepEvents);
-
-    auto EventImpl = detail::getSyclObjImpl(ResEvent);
-    MemOpFunc(MemOpArgs..., getOrWaitEvents(ExpandedDepEvents, MContext),
-              &EventImpl->getHandleRef(), EventImpl);
-
-    if (MContext->is_host())
-      return MDiscardEvents ? createDiscardedEvent() : event();
-
-    // When a queue is recorded by a graph, the dependencies are managed in the
-    // graph implementaton. Additionally, CG recorded for a graph are outside of
-    // the in-order queue execution sequence. Therefore, these CG must not
-    // update MLastEvent.
-    if (isInOrder() && (getCommandGraph() == nullptr)) {
-      MLastEvent = ResEvent;
-      // We don't create a command group for usm commands, so set it to None.
-      // This variable is used to perform explicit dependency management when
-      // required.
-      MLastCGType = CG::CGTYPE::None;
+        getExtendDependencyList(DepEvents, MutableDepEvents, Lock);
+
+    if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
+      if (MHasDiscardEventsSupport) {
+        MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
+                  /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
+        return createDiscardedEvent();
+      }
+
+      event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
+      auto EventImpl = detail::getSyclObjImpl(ResEvent);
+      MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
+                &EventImpl->getHandleRef(), EventImpl);
+
+      if (MContext->is_host())
+        return MDiscardEvents ? createDiscardedEvent() : event();
+
+      if (isInOrder()) {
+        auto &EventToStoreIn =
+            MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr;
+        EventToStoreIn = EventImpl;
+      }
+      // Track only if we won't be able to handle it with piQueueFinish.
+      if (MEmulateOOO)
+        addSharedEvent(ResEvent);
+      return discard_or_return(ResEvent);
     }
   }
-  // Track only if we won't be able to handle it with piQueueFinish.
-  if (MEmulateOOO)
-    addSharedEvent(ResEvent);
-  return MDiscardEvents ? createDiscardedEvent() : ResEvent;
+  return submitWithHandler(Self, DepEvents, HandlerFunc);
 }
 
 void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
@@ -495,9 +554,10 @@ bool queue_impl::ext_oneapi_empty() const {
   // If we have in-order queue where events are not discarded then just check
   // the status of the last event.
   if (isInOrder() && !MDiscardEvents) {
-    std::lock_guard<std::mutex> Lock(MLastEventMtx);
-    return MLastEvent.get_info<info::event::command_execution_status>() ==
-           info::event_command_status::complete;
+    std::lock_guard<std::mutex> Lock(MMutex);
+    return !MLastEventPtr ||
+           MLastEventPtr->get_info<info::event::command_execution_status>() ==
+               info::event_command_status::complete;
   }
 
   // Check the status of the backend queue if this is not a host queue.
@@ -533,6 +593,12 @@ bool queue_impl::ext_oneapi_empty() const {
   return true;
 }
 
+event queue_impl::discard_or_return(const event &Event) {
+  if (!(MDiscardEvents))
+    return Event;
+  return createDiscardedEvent();
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 5aab3fe1598ff..722acf925cf8d 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -211,7 +211,7 @@ class queue_impl {
 #endif
   }
 
-  event getLastEvent() const;
+  event getLastEvent();
 
 private:
   void queue_impl_interop(sycl::detail::pi::PiQueue PiQueue) {
@@ -396,12 +396,14 @@ class queue_impl {
                const std::shared_ptr<queue_impl> &SecondQueue,
                const detail::code_location &Loc,
                const SubmitPostProcessF *PostProcess = nullptr) {
+    event ResEvent;
     try {
-      return submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess);
+      ResEvent = submit_impl(CGF, Self, Self, SecondQueue, Loc, PostProcess);
     } catch (...) {
-      return SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue, Loc,
-                                      PostProcess);
+      ResEvent = SecondQueue->submit_impl(CGF, SecondQueue, Self, SecondQueue,
+                                          Loc, PostProcess);
     }
+    return discard_or_return(ResEvent);
   }
 
   /// Submits a command group function object to the queue, in order to be
@@ -416,7 +418,8 @@ class queue_impl {
                const std::shared_ptr<queue_impl> &Self,
                const detail::code_location &Loc,
                const SubmitPostProcessF *PostProcess = nullptr) {
-    return submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess);
+    auto ResEvent = submit_impl(CGF, Self, Self, nullptr, Loc, PostProcess);
+    return discard_or_return(ResEvent);
   }
 
   /// Performs a blocking wait for the completion of all enqueued tasks in the
@@ -707,6 +710,7 @@ class queue_impl {
       std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
     std::lock_guard<std::mutex> Lock(MMutex);
     MGraph = Graph;
+    MGraphLastEventPtr = nullptr;
   }
 
   std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
@@ -730,9 +734,11 @@ class queue_impl {
 
   const std::vector<event> &
   getExtendDependencyList(const std::vector<event> &DepEvents,
-                          std::vector<event> &MutableVec);
+                          std::vector<event> &MutableVec,
+                          std::unique_lock<std::mutex> &QueueLock);
 
 protected:
+  event discard_or_return(const event &Event);
   // Hook to the scheduler to clean up any fusion command held on destruction.
   void cleanup_fusion_cmd();
 
@@ -740,29 +746,22 @@ class queue_impl {
   template <typename HandlerType = handler>
   void finalizeHandler(HandlerType &Handler, const CG::CGTYPE &Type,
                        event &EventRet) {
-    // When a queue is recorded by a graph, the dependencies are managed in the
-    // graph implementaton. Additionally, CG recorded for a graph are outside of
-    // the in-order queue execution sequence. Therefore, these CG must not
-    // update MLastEvent.
-    if (MIsInorder && (getCommandGraph() == nullptr)) {
-
-      auto IsExpDepManaged = [](const CG::CGTYPE &Type) {
-        return Type == CG::CGTYPE::CodeplayHostTask;
-      };
-
+    if (MIsInorder) {
       // Accessing and changing of an event isn't atomic operation.
-      // Hence, here is are locks for thread-safety.
-      std::lock_guard<std::mutex> LastEventLock{MLastEventMtx};
-
-      if (MLastCGType == CG::CGTYPE::None)
-        MLastCGType = Type;
-      // Also handles case when sync model changes. E.g. Last is host, new is
-      // kernel.
-      bool NeedSeparateDependencyMgmt =
-          IsExpDepManaged(Type) || IsExpDepManaged(MLastCGType);
-
-      if (NeedSeparateDependencyMgmt)
-        Handler.depends_on(MLastEvent);
+      // Hence, here is the lock for thread-safety.
+      std::lock_guard<std::mutex> Lock{MMutex};
+      // This dependency is needed for the following purposes:
+      //    - host tasks are handled by the runtime and cannot be implicitly
+      //    synchronized by the backend.
+      //    - to prevent the 2nd kernel enqueue when the 1st kernel is blocked
+      //    by a host task. This dependency allows to build the enqueue order in
+      //    the RT but will not be passed to the backend. See getPIEvents in
+      //    Command.
+      auto &EventToBuildDeps =
+          MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr;
+      if (EventToBuildDeps)
+        Handler.depends_on(
+            createSyclObjFromImpl<sycl::event>(EventToBuildDeps));
 
       // If there is an external event set, add it as a dependency and clear it.
       // We do not need to hold the lock as MLastEventMtx will ensure the last
@@ -772,9 +771,7 @@ class queue_impl {
         Handler.depends_on(*ExternalEvent);
 
       EventRet = Handler.finalize();
-
-      MLastEvent = EventRet;
-      MLastCGType = Type;
+      EventToBuildDeps = getSyclObjImpl(EventRet);
     } else
       EventRet = Handler.finalize();
   }
@@ -844,20 +841,35 @@ class queue_impl {
     return Event;
   }
 
-  /// Performs direct submission of a memory operation.
+  /// Helper function for submitting a memory operation with a handler.
+  /// \param Self is a shared_ptr to this queue.
+  /// \param DepEvents is a vector of dependencies of the operation.
+  /// \param HandlerFunc is a function that submits the operation with a
+  ///        handler.
+  template <typename HandlerFuncT>
+  event submitWithHandler(const std::shared_ptr<queue_impl> &Self,
+                          const std::vector<event> &DepEvents,
+                          HandlerFuncT HandlerFunc);
+
+  /// Performs submission of a memory operation directly if scheduler can be
+  /// bypassed, or with a handler otherwise.
   ///
   /// \param Self is a shared_ptr to this queue.
   /// \param DepEvents is a vector of dependencies of the operation.
-  /// \param MemOpFunc is a function that forwards its arguments to the
+  /// \param HandlerFunc is a function that submits the operation with a
+  ///        handler.
+  /// \param MemMngrFunc is a function that forwards its arguments to the
   ///        appropriate memory manager function.
-  /// \param MemOpArgs are all the arguments that need to be passed to memory
+  /// \param MemMngrArgs are all the arguments that need to be passed to memory
   ///        manager except the last three: dependencies, PI event and
   ///        EventImplPtr are filled out by this helper.
   /// \return an event representing the submitted operation.
-  template <typename MemOpFuncT, typename... MemOpArgTs>
+  template <typename HandlerFuncT, typename MemMngrFuncT,
+            typename... MemMngrArgTs>
   event submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
                           const std::vector<event> &DepEvents,
-                          MemOpFuncT MemOpFunc, MemOpArgTs... MemOpArgs);
+                          HandlerFuncT HandlerFunc, MemMngrFuncT MemMngrFunc,
+                          MemMngrArgTs... MemOpArgs);
 
   // When instrumentation is enabled emits trace event for wait begin and
   // returns the telemetry event generated for the wait
@@ -911,13 +923,12 @@ class queue_impl {
   buffer<AssertHappened, 1> MAssertHappenedBuffer;
 
   // This event is employed for enhanced dependency tracking with in-order queue
-  // Access to the event should be guarded with MLastEventMtx
-  event MLastEvent;
-  mutable std::mutex MLastEventMtx;
-  // Used for in-order queues in pair with MLastEvent
-  // Host tasks are explicitly synchronized in RT, pi tasks - implicitly by
-  // backend. Using type to setup explicit sync between host and pi tasks.
-  CG::CGTYPE MLastCGType = CG::CGTYPE::None;
+  // Access to the event should be guarded with MMutex
+  EventImplPtr MLastEventPtr;
+  // Same as above but for graph begin-end recording cycle.
+  // Track deps within graph commands separately.
+  // Protected by common queue object mutex MMutex.
+  EventImplPtr MGraphLastEventPtr;
 
   const bool MIsInorder;
 
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
index f22913f990e9c..5ff184be64948 100644
--- a/sycl/source/detail/scheduler/graph_processor.cpp
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -53,16 +53,6 @@ bool Scheduler::GraphProcessor::handleBlockingCmd(Command *Cmd,
                                                   BlockingT Blocking) {
   if (Cmd == RootCommand || Blocking)
     return true;
-  // Async kernel enqueue depending on host task is not compatible with in order
-  // queue. If we have host_task_1, kernel_2 depending on host_task_1 and
-  // kernel_3 without explicit dependencies submitted to in order queue: host
-  // task blocks kernel_2 from being enqueued while kernel_3 has no such
-  // dependencies so in current impl it could be enqueued earlier that kernel_2.
-  // That makes it impossible to use this path with blocking users for in order
-  // queue.
-  if (QueueImplPtr Queue = RootCommand->getEvent()->getSubmittedQueue();
-      Queue && Queue->isInOrder())
-    return true;
   {
     std::lock_guard<std::mutex> Guard(Cmd->MBlockedUsersMutex);
     if (Cmd->isBlocking()) {
diff --git a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp
index d46fe6c80cc33..91171d6ce0dd1 100644
--- a/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp
+++ b/sycl/test-e2e/Plugin/level_zero_barrier_optimization.cpp
@@ -54,7 +54,7 @@ int main() {
 
     // CHECK: Test2
     // CHECK: ---> piEnqueueEventsWaitWithBarrier(
-    // CHECK: ZE ---> zeEventCreate
+    // CHECK: ZE ---> {{zeEventCreate|zeEventHostReset}}
     // CHECK: ZE ---> zeCommandListAppendWaitOnEvents
     // CHECK: ZE ---> zeCommandListAppendSignalEvent
     // CHECK: ) ---> 	pi_result : PI_SUCCESS
diff --git a/sycl/unittests/Extensions/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph.cpp
index b8d45404c627d..ce7bb02e278b2 100644
--- a/sycl/unittests/Extensions/CommandGraph.cpp
+++ b/sycl/unittests/Extensions/CommandGraph.cpp
@@ -1199,9 +1199,11 @@ TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
       [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
   auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
   auto EventLastWaitList = EventLastImpl->getWaitList();
-  // Previous task is not a host task. In Order queue dependency are managed by
-  // the backend for non-host kernels.
-  ASSERT_EQ(EventLastWaitList.size(), 0lu);
+  // Previous task is not a host task. Explicit dependency is still needed
+  // to properly handle blocked tasks (the event will be filtered out before
+  // submission to the backend).
+  ASSERT_EQ(EventLastWaitList.size(), 1lu);
+  ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
 }
 
 TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {
diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp
index 0cce7dcd2ea92..a77995a203da3 100644
--- a/sycl/unittests/scheduler/AllocaLinking.cpp
+++ b/sycl/unittests/scheduler/AllocaLinking.cpp
@@ -46,6 +46,7 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device,
 }
 
 TEST_F(SchedulerTest, AllocaLinking) {
+  HostUnifiedMemory = false;
   // This host device constructor should be placed before Mock.redefine
   // because it overrides the real implementation of get_device_info
   // which is needed when creating a host device.
diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
index 020a956537ddd..37bb5106d75b2 100644
--- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp
@@ -11,13 +11,14 @@
 
 #include <helpers/PiImage.hpp>
 #include <helpers/PiMock.hpp>
+#include <helpers/TestKernel.hpp>
 
 #include <detail/event_impl.hpp>
 #include <detail/queue_impl.hpp>
 
 #include <gtest/gtest.h>
 
-#include <helpers/TestKernel.hpp>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
@@ -32,6 +33,7 @@ inline pi_result redefinedEventsWait(pi_uint32 num_events,
 }
 
 TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
+  GEventsWaitCounter = 0;
   sycl::unittest::PiMock Mock;
   sycl::platform Plt = Mock.getPlatform();
   Mock.redefineBefore<detail::PiApiKind::piEventsWait>(redefinedEventsWait);
@@ -39,16 +41,131 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) {
   context Ctx{Plt};
   queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};
 
+  auto buf = sycl::malloc_shared<int>(1, InOrderQueue);
+  event Evt = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); });
+  InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); })
+      .wait();
+
+  EXPECT_EQ(GEventsWaitCounter, 1u);
+}
+
+enum class CommandType { KERNEL = 1, MEMSET = 2 };
+std::vector<std::pair<CommandType, size_t>> ExecutedCommands;
+
+inline pi_result customEnqueueKernelLaunch(pi_queue, pi_kernel, pi_uint32,
+                                           const size_t *, const size_t *,
+                                           const size_t *,
+                                           pi_uint32 EventsCount,
+                                           const pi_event *, pi_event *) {
+  ExecutedCommands.push_back({CommandType::KERNEL, EventsCount});
+  return PI_SUCCESS;
+}
+inline pi_result customextUSMEnqueueMemset(pi_queue, void *, pi_int32, size_t,
+                                           pi_uint32 EventsCount,
+                                           const pi_event *, pi_event *) {
+  ExecutedCommands.push_back({CommandType::MEMSET, EventsCount});
+  return PI_SUCCESS;
+}
+
+TEST_F(SchedulerTest, InOrderQueueCrossDeps) {
+  ExecutedCommands.clear();
+  sycl::unittest::PiMock Mock;
+  Mock.redefineBefore<detail::PiApiKind::piEnqueueKernelLaunch>(
+      customEnqueueKernelLaunch);
+  Mock.redefineBefore<detail::PiApiKind::piextUSMEnqueueMemset>(
+      customextUSMEnqueueMemset);
+
+  sycl::platform Plt = Mock.getPlatform();
+
+  context Ctx{Plt};
+  queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};
+
   kernel_bundle KernelBundle =
       sycl::get_kernel_bundle<sycl::bundle_state::input>(Ctx);
   auto ExecBundle = sycl::build(KernelBundle);
 
-  event Evt = InOrderQueue.submit([&](sycl::handler &CGH) {
+  std::mutex CvMutex;
+  std::condition_variable Cv;
+  bool ready = false;
+
+  InOrderQueue.submit([&](sycl::handler &CGH) {
+    CGH.host_task([&] {
+      std::unique_lock lk(CvMutex);
+      Cv.wait(lk, [&ready] { return ready; });
+    });
+  });
+
+  auto buf = sycl::malloc_shared<int>(1, InOrderQueue);
+
+  event Ev1 = InOrderQueue.submit(
+      [&](sycl::handler &CGH) { CGH.memset(buf, 0, sizeof(buf[0])); });
+
+  event Ev2 = InOrderQueue.submit([&](sycl::handler &CGH) {
     CGH.use_kernel_bundle(ExecBundle);
     CGH.single_task<TestKernel<>>([] {});
   });
-  InOrderQueue.submit([&](sycl::handler &CGH) { CGH.host_task([=] {}); })
-      .wait();
 
-  EXPECT_TRUE(GEventsWaitCounter == 1);
+  {
+    std::unique_lock lk(CvMutex);
+    ready = true;
+  }
+  Cv.notify_one();
+
+  InOrderQueue.wait();
+
+  ASSERT_EQ(ExecutedCommands.size(), 2u);
+  EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
+  EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
+  EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
+  EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
 }
+
+TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) {
+  ExecutedCommands.clear();
+  sycl::unittest::PiMock Mock;
+  Mock.redefineBefore<detail::PiApiKind::piEnqueueKernelLaunch>(
+      customEnqueueKernelLaunch);
+  Mock.redefineBefore<detail::PiApiKind::piextUSMEnqueueMemset>(
+      customextUSMEnqueueMemset);
+
+  sycl::platform Plt = Mock.getPlatform();
+  if (Plt.is_host()) {
+    std::cout << "Not run due to host-only environment\n";
+    GTEST_SKIP();
+  }
+
+  context Ctx{Plt};
+  queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()};
+
+  std::mutex CvMutex;
+  std::condition_variable Cv;
+  bool ready = false;
+
+  InOrderQueue.submit([&](sycl::handler &CGH) {
+    CGH.host_task([&] {
+      std::unique_lock lk(CvMutex);
+      Cv.wait(lk, [&ready] { return ready; });
+    });
+  });
+
+  auto buf = sycl::malloc_shared<int>(1, InOrderQueue);
+
+  event Ev1 = InOrderQueue.memset(buf, 0, sizeof(buf[0]));
+
+  event Ev2 = InOrderQueue.single_task<TestKernel<>>([] {});
+
+  {
+    std::unique_lock lk(CvMutex);
+    ready = true;
+  }
+  Cv.notify_one();
+
+  InOrderQueue.wait();
+
+  ASSERT_EQ(ExecutedCommands.size(), 2u);
+  EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET);
+  EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u);
+  EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL);
+  EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u);
+}
\ No newline at end of file
diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
index ca1e7d664bf58..4b50791981258 100644
--- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
@@ -61,14 +61,17 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) {
   //  host      | yes - always, separate sync management
   //  host      | yes - always, separate sync management
   //  kernel    | yes - change of sync approach
-  //  kernel    | no  - sync between pi calls must be done by backend
+  //  kernel    | yes - sync between pi calls must be done by backend, but we
+  //  still add dependency to handle the right order due to host task. This
+  //  dependency will not be sent to backend. It is checked in
+  //  SchedulerTest.InOrderQueueCrossDeps
   //  host      | yes - always, separate sync management
 
   sycl::event Event;
   // host task
   {
     LimitedHandlerSimulation MockCGH;
-    EXPECT_CALL(MockCGH, depends_on).Times(1);
+    EXPECT_CALL(MockCGH, depends_on).Times(0);
     Queue->finalizeHandler<LimitedHandlerSimulation>(
         MockCGH, detail::CG::CGTYPE::CodeplayHostTask, Event);
   }
@@ -89,7 +92,7 @@ TEST_F(SchedulerTest, InOrderQueueSyncCheck) {
   // kernel task
   {
     LimitedHandlerSimulation MockCGH;
-    EXPECT_CALL(MockCGH, depends_on).Times(0);
+    EXPECT_CALL(MockCGH, depends_on).Times(1);
     Queue->finalizeHandler<LimitedHandlerSimulation>(
         MockCGH, detail::CG::CGTYPE::Kernel, Event);
   }