diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 91b2b0e22eb1c..318a0b9827fe2 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -23,6 +23,7 @@ subject to change. Do not rely on these variables in production code.
 | SYCL_THROW_ON_BLOCK | Any(\*) | Throw an exception on attempt to wait for a blocked command.  |
 | SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
 | SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. |
+| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
 `(*) Note: Any means this environment variable is effective when set to any non-null value.`
 
 ### SYCL_PRINT_EXECUTION_GRAPH Options
diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp
index b2c2ec7af60d0..97feaa283182c 100644
--- a/sycl/include/CL/sycl/accessor.hpp
+++ b/sycl/include/CL/sycl/accessor.hpp
@@ -778,7 +778,7 @@ class accessor :
   template <int Dims = Dimensions, typename AllocatorT,
 	   typename = typename detail::enable_if_t<
 		   (Dims == 0) && 
-                    (!IsPlaceH && (IsGlobalBuf || IsConstantBuf))>
+                    (!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>
 		    			>
   accessor(buffer<DataT,1,AllocatorT> &BufferRef,
 		  handler &CommandGroupHandler)
@@ -817,9 +817,9 @@ class accessor :
 #endif
 
   template <int Dims = Dimensions, typename AllocatorT,
-            typename = detail::enable_if_t<(Dims > 0) && (Dims == Dimensions) &&
-                                           (!IsPlaceH &&
-                                            (IsGlobalBuf || IsConstantBuf))>>
+            typename = detail::enable_if_t<
+                (Dims > 0) && (Dims == Dimensions) &&
+                (!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>>
   accessor(buffer<DataT, Dims, AllocatorT> &BufferRef,
            handler &CommandGroupHandler)
 #ifdef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index 100c29e8ebe38..26ee7b6f89837 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -215,6 +215,16 @@ class InteropTask {
   void call(cl::sycl::interop_handler &h) { MFunc(h); }
 };
 
+class HostTask {
+  std::function<void()> MHostTask;
+
+public:
+  HostTask() : MHostTask([]() {}) {}
+  HostTask(std::function<void()> &&Func) : MHostTask(Func) {}
+
+  void call() { MHostTask(); }
+};
+
 // Class which stores specific lambda object.
 template <class KernelType, class KernelArgType, int Dims>
 class HostKernel : public HostKernelBase {
@@ -391,7 +401,8 @@ class CG {
     COPY_USM,
     FILL_USM,
     PREFETCH_USM,
-    INTEROP_TASK_CODEPLAY
+    CODEPLAY_INTEROP_TASK,
+    CODEPLAY_HOST_TASK
   };
 
   CG(CGTYPE Type, vector_class<vector_class<char>> ArgsStorage,
@@ -631,6 +642,24 @@ class CGInteropTask : public CG {
         MInteropTask(std::move(InteropTask)) {}
 };
 
+class CGHostTask : public CG {
+public:
+  std::unique_ptr<HostTask> MHostTask;
+  vector_class<ArgDesc> MArgs;
+
+  CGHostTask(std::unique_ptr<HostTask> HostTask, vector_class<ArgDesc> Args,
+             std::vector<std::vector<char>> ArgsStorage,
+             std::vector<detail::AccessorImplPtr> AccStorage,
+             std::vector<std::shared_ptr<const void>> SharedPtrStorage,
+             std::vector<Requirement *> Requirements,
+             std::vector<detail::EventImplPtr> Events, CGTYPE Type,
+             detail::code_location loc = {})
+      : CG(Type, std::move(ArgsStorage), std::move(AccStorage),
+           std::move(SharedPtrStorage), std::move(Requirements),
+           std::move(Events), std::move(loc)),
+        MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {}
+};
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index a2c24ee63febb..d08816bd56206 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -105,6 +105,27 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
   using name = Type;
 };
 
+template <typename, typename T> struct check_fn_signature {
+  static_assert(std::integral_constant<T, false>::value,
+                "Second template parameter is required to be of function type");
+};
+
+template <typename F, typename RetT, typename... Args>
+struct check_fn_signature<F, RetT(Args...)> {
+private:
+  template <typename T>
+  static constexpr auto check(T *) -> typename std::is_same<
+      decltype(std::declval<T>().operator()(std::declval<Args>()...)),
+      RetT>::type;
+
+  template <typename> static constexpr std::false_type check(...);
+
+  using type = decltype(check<F>(0));
+
+public:
+  static constexpr bool value = type::value;
+};
+
 __SYCL_EXPORT device getDeviceFromHandler(handler &);
 
 } // namespace detail
@@ -802,6 +823,20 @@ class __SYCL_EXPORT handler {
     MCGType = detail::CG::RUN_ON_HOST_INTEL;
   }
 
+  template <typename FuncT>
+  typename std::enable_if<detail::check_fn_signature<
+      typename std::remove_reference<FuncT>::type, void()>::value>::type
+  codeplay_host_task(FuncT Func) {
+    throwIfActionIsCreated();
+
+    MNDRDesc.set(range<1>(1));
+    MArgs = std::move(MAssociatedAccesors);
+
+    MHostTask.reset(new detail::HostTask(std::move(Func)));
+
+    MCGType = detail::CG::CODEPLAY_HOST_TASK;
+  }
+
   /// Defines and invokes a SYCL kernel function for the specified range and
   /// offset.
   ///
@@ -1128,7 +1163,7 @@ class __SYCL_EXPORT handler {
   template <typename FuncT> void interop_task(FuncT Func) {
 
     MInteropTask.reset(new detail::InteropTask(std::move(Func)));
-    MCGType = detail::CG::INTEROP_TASK_CODEPLAY;
+    MCGType = detail::CG::CODEPLAY_INTEROP_TASK;
   }
 
   /// Defines and invokes a SYCL kernel function for the specified range.
@@ -1586,6 +1621,8 @@ class __SYCL_EXPORT handler {
   vector_class<char> MPattern;
   /// Storage for a lambda or function object.
   unique_ptr_class<detail::HostKernelBase> MHostKernel;
+  /// Storage for lambda/function when using HostTask
+  unique_ptr_class<detail::HostTask> MHostTask;
   detail::OSModuleHandle MOSModuleHandle;
   // Storage for a lambda or function when using InteropTasks
   std::unique_ptr<detail::InteropTask> MInteropTask;
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index f3263720a2bb4..313ef01472d98 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -50,11 +50,31 @@ event_impl::~event_impl() {
 }
 
 void event_impl::waitInternal() const {
-  if (!MHostEvent) {
+  if (!MHostEvent && MEvent) {
     getPlugin().call<PiApiKind::piEventsWait>(1, &MEvent);
+    return;
   }
-  // Waiting of host events is NOP so far as all operations on host device
-  // are blocking.
+
+  while (MState != HES_Complete)
+    ;
+}
+
+void event_impl::setComplete() {
+  if (MHostEvent || !MEvent) {
+#ifndef NDEBUG
+    int Expected = HES_NotComplete;
+    int Desired = HES_Complete;
+
+    bool Succeeded = MState.compare_exchange_strong(Expected, Desired);
+
+    assert(Succeeded && "Unexpected state of event");
+#else
+    MState.store(static_cast<int>(HES_Complete));
+#endif
+    return;
+  }
+
+  assert(false && "setComplete is not supported for non-host event");
 }
 
 const RT::PiEvent &event_impl::getHandleRef() const { return MEvent; }
@@ -68,11 +88,15 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) {
   MHostEvent = Context->is_host();
   MOpenCLInterop = !MHostEvent;
   MContext = Context;
+
+  MState = HES_NotComplete;
 }
 
+event_impl::event_impl() : MState(HES_Complete) {}
+
 event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
     : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)),
-      MOpenCLInterop(true), MHostEvent(false) {
+      MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) {
 
   if (MContext->is_host()) {
     throw cl::sycl::invalid_parameter_error(
@@ -96,12 +120,19 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
 }
 
 event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) {
-  if (Queue->is_host() &&
-      Queue->has_property<property::queue::enable_profiling>()) {
-    MHostProfilingInfo.reset(new HostProfilingInfo());
-    if (!MHostProfilingInfo)
-      throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
+  if (Queue->is_host()) {
+    MState.store(HES_NotComplete);
+
+    if (Queue->has_property<property::queue::enable_profiling>()) {
+      MHostProfilingInfo.reset(new HostProfilingInfo());
+      if (!MHostProfilingInfo)
+        throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
+    }
+
+    return;
   }
+
+  MState.store(HES_Complete);
 }
 
 void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID,
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index 94600f5eb6f9f..58616e13a12d2 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -14,6 +14,7 @@
 #include <CL/sycl/info/info_desc.hpp>
 #include <CL/sycl/stl.hpp>
 
+#include <atomic>
 #include <cassert>
 
 __SYCL_INLINE_NAMESPACE(cl) {
@@ -32,7 +33,7 @@ class event_impl {
   /// Constructs a ready SYCL event.
   ///
   /// If the constructed SYCL event is waited on it will complete immediately.
-  event_impl() = default;
+  event_impl();
   /// Constructs an event instance from a plug-in event handle.
   ///
   /// The SyclContext must match the plug-in context associated with the
@@ -166,6 +167,13 @@ class event_impl {
   bool MHostEvent = true;
   std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
   void *MCommand = nullptr;
+
+  enum HostEventState : int { HES_NotComplete = 0, HES_Complete };
+
+  // State of host event. Employed only for host events and event with no
+  // backend's representation (e.g. alloca). Used values are listed in
+  // HostEventState enum.
+  std::atomic<int> MState;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index da07df6b949db..f17807f36c687 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -195,6 +195,30 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
 #endif
 }
 
+void queue_impl::initHostTaskAndEventCallbackThreadPool() {
+  if (MHostTaskThreadPool)
+    return;
+
+  int Size = 1;
+
+  if (const char *val = std::getenv("SYCL_QUEUE_THREAD_POOL_SIZE"))
+    try {
+      Size = std::stoi(val);
+    } catch (...) {
+      throw invalid_parameter_error(
+          "Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
+          PI_INVALID_VALUE);
+    }
+
+  if (Size < 1)
+    throw invalid_parameter_error(
+        "Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
+        PI_INVALID_VALUE);
+
+  MHostTaskThreadPool.reset(new ThreadPool(Size));
+  MHostTaskThreadPool->start();
+}
+
 pi_native_handle queue_impl::getNative() const {
   auto Plugin = getPlugin();
   pi_native_handle Handle;
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 64667cc9c5569..367bd95746614 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -21,6 +21,7 @@
 #include <detail/event_impl.hpp>
 #include <detail/plugin.hpp>
 #include <detail/scheduler/scheduler.hpp>
+#include <detail/thread_pool.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
@@ -348,6 +349,13 @@ class queue_impl {
     MExceptions.PushBack(ExceptionPtr);
   }
 
+  ThreadPool &getThreadPool() {
+    if (!MHostTaskThreadPool)
+      initHostTaskAndEventCallbackThreadPool();
+
+    return *MHostTaskThreadPool;
+  }
+
   /// Gets the native handle of the SYCL queue.
   ///
   /// \return a native handle.
@@ -380,6 +388,8 @@ class queue_impl {
   void instrumentationEpilog(void *TelementryEvent, string_class &Name,
                              int32_t StreamID, uint64_t IId);
 
+  void initHostTaskAndEventCallbackThreadPool();
+
   /// Stores a USM operation event that should be associated with the queue
   ///
   /// \param Event is the event to be stored
@@ -414,6 +424,10 @@ class queue_impl {
   const bool MOpenCLInterop = false;
   // Assume OOO support by default.
   bool MSupportOOO = true;
+
+  // Thread pool for host task and event callbacks execution.
+  // The thread pool is instantiated upon the very first call to getThreadPool()
+  std::unique_ptr<ThreadPool> MHostTaskThreadPool;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 1e5f1c8e4835a..eb40bcdf82044 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -41,6 +41,7 @@
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
+
 #ifdef XPTI_ENABLE_INSTRUMENTATION
 // Global graph for the application
 extern xpti::trace_event_data_t *GSYCLGraphEvent;
@@ -156,58 +157,79 @@ getPiEvents(const std::vector<EventImplPtr> &EventImpls) {
   return RetPiEvents;
 }
 
-void EventCompletionClbk(RT::PiEvent, pi_int32, void *data) {
-  // TODO: Handle return values. Store errors to async handler.
-  EventImplPtr *Event = (reinterpret_cast<EventImplPtr *>(data));
-  RT::PiEvent &EventHandle = (*Event)->getHandleRef();
-  const detail::plugin &Plugin = (*Event)->getPlugin();
-  Plugin.call<PiApiKind::piEventSetStatus>(EventHandle, PI_EVENT_COMPLETE);
-  delete (Event);
-}
+class DispatchHostTask {
+  ExecCGCommand *MThisCmd;
 
-// Method prepares PI event's from list sycl::event's
-std::vector<EventImplPtr> Command::prepareEvents(ContextImplPtr Context) {
-  std::vector<EventImplPtr> Result;
-  std::vector<EventImplPtr> GlueEvents;
-  for (EventImplPtr &DepEvent : MDepsEvents) {
-    // Async work is not supported for host device.
-    if (DepEvent->is_host()) {
-      DepEvent->waitInternal();
-      continue;
+  void waitForEvents() const {
+    std::map<const detail::plugin *, std::vector<EventImplPtr>>
+        RequiredEventsPerPlugin;
+
+    for (const EventImplPtr &Event : MThisCmd->MPreparedDepsEvents) {
+      const detail::plugin &Plugin = Event->getPlugin();
+      RequiredEventsPerPlugin[&Plugin].push_back(Event);
     }
-    // The event handle can be null in case of, for example, alloca command,
-    // which is currently synchrounious, so don't generate OpenCL event.
-    if (DepEvent->getHandleRef() == nullptr) {
-      continue;
+
+    // wait for dependency device events
+    // FIXME Current implementation of waiting for events will make the thread
+    // 'sleep' until all of dependency events are complete. We need a bit more
+    // sophisticated waiting mechanism to allow to utilize this thread for any
+    // other available job and resume once all required events are ready.
+    for (auto &PluginWithEvents : RequiredEventsPerPlugin) {
+      std::vector<RT::PiEvent> RawEvents = getPiEvents(PluginWithEvents.second);
+      PluginWithEvents.first->call<PiApiKind::piEventsWait>(RawEvents.size(),
+                                                            RawEvents.data());
     }
-    ContextImplPtr DepEventContext = DepEvent->getContextImpl();
-
-    // If contexts don't match - connect them using user event
-    if (DepEventContext != Context && !Context->is_host()) {
-      EventImplPtr GlueEvent(new detail::event_impl());
-      GlueEvent->setContextImpl(Context);
-      EventImplPtr *GlueEventCopy =
-          new EventImplPtr(GlueEvent); // To increase the reference count by 1.
-
-      RT::PiEvent &GlueEventHandle = GlueEvent->getHandleRef();
-      auto Plugin = Context->getPlugin();
-      auto DepPlugin = DepEventContext->getPlugin();
-      // Add an event on the current context that
-      // is triggered when the DepEvent is complete
-      Plugin.call<PiApiKind::piEventCreate>(Context->getHandleRef(),
-                                            &GlueEventHandle);
-
-      DepPlugin.call<PiApiKind::piEventSetCallback>(
-          DepEvent->getHandleRef(), PI_EVENT_COMPLETE, EventCompletionClbk,
-          /*void *data=*/(GlueEventCopy));
-      GlueEvents.push_back(GlueEvent);
-      Result.push_back(std::move(GlueEvent));
-      continue;
+
+    // wait for dependency host events
+    for (const EventImplPtr &Event : MThisCmd->MPreparedHostDepsEvents) {
+      Event->waitInternal();
     }
-    Result.push_back(DepEvent);
   }
-  MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end());
-  return Result;
+
+public:
+  DispatchHostTask(ExecCGCommand *ThisCmd) : MThisCmd{ThisCmd} {}
+
+  void operator()() const {
+    waitForEvents();
+
+    assert(MThisCmd->getCG().getType() == CG::CGTYPE::CODEPLAY_HOST_TASK);
+
+    CGHostTask &HostTask = static_cast<CGHostTask &>(MThisCmd->getCG());
+
+    // we're ready to call the user-defined lambda now
+    HostTask.MHostTask->call();
+    HostTask.MHostTask.reset();
+
+    // unblock user empty command here
+    EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
+    assert(EmptyCmd && "No empty command found");
+
+    // Completing command's event along with unblocking enqueue readiness of
+    // empty command may lead to quick deallocation of MThisCmd by some cleanup
+    // process. Thus we'll copy deps prior to completing of event and unblocking
+    // of empty command.
+    // Also, it's possible to have record deallocated prior to enqueue process.
+    // Thus we employ read-lock of graph.
+    {
+      Scheduler &Sched = Scheduler::getInstance();
+      std::shared_lock<std::shared_timed_mutex> Lock(Sched.MGraphLock);
+
+      std::vector<DepDesc> Deps = MThisCmd->MDeps;
+
+      // update self-event status
+      MThisCmd->MEvent->setComplete();
+
+      EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
+
+      for (const DepDesc &Dep : Deps)
+        Scheduler::enqueueLeavesOfReqUnlocked(Dep.MDepRequirement);
+    }
+  }
+};
+
+void Command::waitForPreparedHostEvents() const {
+  for (const EventImplPtr &HostEvent : MPreparedHostDepsEvents)
+    HostEvent->waitInternal();
 }
 
 void Command::waitForEvents(QueueImplPtr Queue,
@@ -403,9 +425,37 @@ void Command::makeTraceEventEpilog() {
 #endif
 }
 
+void Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
+  const ContextImplPtr &Context = getContext();
+
+  // 1. Async work is not supported for host device.
+  // 2. The event handle can be null in case of, for example, alloca command,
+  //    which is currently synchrounious, so don't generate OpenCL event.
+  //    Though, this event isn't host one as it's context isn't host one.
+  if (DepEvent->is_host() || DepEvent->getHandleRef() == nullptr) {
+    // call to waitInternal() is in waitForPreparedHostEvents() as it's called
+    // from enqueue process functions
+    MPreparedHostDepsEvents.push_back(DepEvent);
+    return;
+  }
+
+  ContextImplPtr DepEventContext = DepEvent->getContextImpl();
+  // If contexts don't match we'll connect them using host task
+  if (DepEventContext != Context && !Context->is_host()) {
+    Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder;
+    GB.connectDepEvent(this, DepEvent, Dep);
+  } else
+    MPreparedDepsEvents.push_back(std::move(DepEvent));
+}
+
+ContextImplPtr Command::getContext() const {
+  return detail::getSyclObjImpl(MQueue->get_context());
+}
+
 void Command::addDep(DepDesc NewDep) {
-  if (NewDep.MDepCommand)
-    MDepsEvents.push_back(NewDep.MDepCommand->getEvent());
+  if (NewDep.MDepCommand) {
+    processDepEvent(NewDep.MDepCommand->getEvent(), NewDep);
+  }
   MDeps.push_back(NewDep);
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   emitEdgeEventForCommandDependence(
@@ -424,7 +474,7 @@ void Command::addDep(EventImplPtr Event) {
   emitEdgeEventForEventDependence(Cmd, PiEventAddr);
 #endif
 
-  MDepsEvents.push_back(std::move(Event));
+  processDepEvent(std::move(Event), DepDesc{nullptr, nullptr, nullptr});
 }
 
 void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
@@ -466,7 +516,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
     if (ThrowOnBlock)
       throw sycl::runtime_error(
           std::string("Waiting for blocked command. Block reason: ") +
-              std::string(MBlockReason),
+              std::string(getBlockReason()),
           PI_INVALID_OPERATION);
 
 #ifdef XPTI_ENABLE_INSTRUMENTATION
@@ -474,7 +524,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
     // event, which models the barrier while enqueuing along with the blocked
     // reason, as determined by the scheduler
     std::string Info = "enqueue.barrier[";
-    Info += std::string(MBlockReason) + "]";
+    Info += std::string(getBlockReason()) + "]";
     emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
 #endif
 
@@ -505,15 +555,21 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
   // has already been marked as "failed" if enqueueImp throws an exception.
   // This will avoid execution of the same failed command twice.
   MEnqueueStatus = EnqueueResultT::SyclEnqueueFailed;
+  MShouldCompleteEventIfPossible = true;
   cl_int Res = enqueueImp();
 
   if (CL_SUCCESS != Res)
     EnqueueResult =
         EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res);
-  else
+  else {
+    if (MShouldCompleteEventIfPossible &&
+        (MEvent->is_host() || MEvent->getHandleRef() == nullptr))
+      MEvent->setComplete();
+
     // Consider the command is successfully enqueued if return code is
     // CL_SUCCESS
     MEnqueueStatus = EnqueueResultT::SyclEnqueueSuccess;
+  }
 
   // Emit this correlation signal before the task end
   emitEnqueuedEventSignal(MEvent->getHandleRef());
@@ -564,6 +620,17 @@ void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
 #endif
 }
 
+const char *Command::getBlockReason() const {
+  switch (MBlockReason) {
+  case BlockReason::HostAccessor:
+    return "A Buffer is locked by the host accessor";
+  case BlockReason::HostTask:
+    return "Blocked by host task";
+  }
+
+  return "Unknown block reason";
+}
+
 AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
                                      Requirement Req,
                                      AllocaCommandBase *LinkedAllocaCmd)
@@ -617,8 +684,8 @@ void AllocaCommand::emitInstrumentationData() {
 }
 
 cl_int AllocaCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
 
   RT::PiEvent &Event = MEvent->getHandleRef();
 
@@ -628,6 +695,7 @@ cl_int AllocaCommand::enqueueImp() {
     if (MQueue->is_host()) {
       // Do not need to make allocation if we have a linked device allocation
       Command::waitForEvents(MQueue, EventImpls, Event);
+
       return CL_SUCCESS;
     }
     HostPtr = MLinkedAllocaCmd->getMemAllocation();
@@ -637,6 +705,7 @@ cl_int AllocaCommand::enqueueImp() {
   MMemAllocation = MemoryManager::allocate(
       detail::getSyclObjImpl(MQueue->get_context()), getSYCLMemObj(),
       MInitFromUserData, HostPtr, std::move(EventImpls), Event);
+
   return CL_SUCCESS;
 }
 
@@ -707,8 +776,8 @@ void *AllocaSubBufCommand::getMemAllocation() const {
 }
 
 cl_int AllocaSubBufCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   RT::PiEvent &Event = MEvent->getHandleRef();
 
   MMemAllocation = MemoryManager::allocateMemSubBuffer(
@@ -716,6 +785,7 @@ cl_int AllocaSubBufCommand::enqueueImp() {
       MParentAlloca->getMemAllocation(), MRequirement.MElemSize,
       MRequirement.MOffsetInBytes, MRequirement.MAccessRange,
       std::move(EventImpls), Event);
+
   return CL_SUCCESS;
 }
 
@@ -767,8 +837,8 @@ void ReleaseCommand::emitInstrumentationData() {
 }
 
 cl_int ReleaseCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
   bool SkipRelease = false;
 
@@ -878,8 +948,8 @@ void MapMemObject::emitInstrumentationData() {
 }
 
 cl_int MapMemObject::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
 
   RT::PiEvent &Event = MEvent->getHandleRef();
@@ -887,6 +957,7 @@ cl_int MapMemObject::enqueueImp() {
       MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MQueue,
       MMapMode, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange,
       MSrcReq.MOffset, MSrcReq.MElemSize, std::move(RawEvents), Event);
+
   return CL_SUCCESS;
 }
 
@@ -934,14 +1005,15 @@ void UnMapMemObject::emitInstrumentationData() {
 }
 
 cl_int UnMapMemObject::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
 
   RT::PiEvent &Event = MEvent->getHandleRef();
   MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(),
                        MDstAllocaCmd->getMemAllocation(), MQueue, *MSrcPtr,
                        std::move(RawEvents), Event);
+
   return CL_SUCCESS;
 }
 
@@ -1000,11 +1072,15 @@ void MemCpyCommand::emitInstrumentationData() {
 #endif
 }
 
+ContextImplPtr MemCpyCommand::getContext() const {
+  const QueueImplPtr &Queue = MQueue->is_host() ? MSrcQueue : MQueue;
+  return detail::getSyclObjImpl(Queue->get_context());
+}
+
 cl_int MemCpyCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls;
   QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue;
-  EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
 
   RT::PiEvent &Event = MEvent->getHandleRef();
 
@@ -1067,9 +1143,8 @@ void ExecCGCommand::flushStreams() {
 }
 
 cl_int UpdateHostRequirementCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls;
-  EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   RT::PiEvent &Event = MEvent->getHandleRef();
   Command::waitForEvents(MQueue, EventImpls, Event);
 
@@ -1077,6 +1152,7 @@ cl_int UpdateHostRequirementCommand::enqueueImp() {
   assert(MSrcAllocaCmd->getMemAllocation() && "Expected valid source pointer");
   assert(MDstPtr && "Expected valid target pointer");
   *MDstPtr = MSrcAllocaCmd->getMemAllocation();
+
   return CL_SUCCESS;
 }
 
@@ -1140,10 +1216,15 @@ void MemCpyCommandHost::emitInstrumentationData() {
 #endif
 }
 
+ContextImplPtr MemCpyCommandHost::getContext() const {
+  const QueueImplPtr &Queue = MQueue->is_host() ? MSrcQueue : MQueue;
+  return detail::getSyclObjImpl(Queue->get_context());
+}
+
 cl_int MemCpyCommandHost::enqueueImp() {
   QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue;
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context()));
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   std::vector<RT::PiEvent> RawEvents = getPiEvents(EventImpls);
 
   RT::PiEvent &Event = MEvent->getHandleRef();
@@ -1153,6 +1234,7 @@ cl_int MemCpyCommandHost::enqueueImp() {
   if (MDstReq.MAccessMode == access::mode::discard_read_write ||
       MDstReq.MAccessMode == access::mode::discard_write) {
     Command::waitForEvents(Queue, EventImpls, Event);
+
     return CL_SUCCESS;
   }
 
@@ -1162,23 +1244,43 @@ cl_int MemCpyCommandHost::enqueueImp() {
       MSrcReq.MOffset, MSrcReq.MElemSize, *MDstPtr, MQueue, MDstReq.MDims,
       MDstReq.MMemoryRange, MDstReq.MAccessRange, MDstReq.MOffset,
       MDstReq.MElemSize, std::move(RawEvents), Event);
+
   return CL_SUCCESS;
 }
 
-EmptyCommand::EmptyCommand(QueueImplPtr Queue, Requirement Req)
-    : Command(CommandType::EMPTY_TASK, std::move(Queue)),
-      MRequirement(std::move(Req)) {
-
+EmptyCommand::EmptyCommand(QueueImplPtr Queue)
+    : Command(CommandType::EMPTY_TASK, std::move(Queue)) {
   emitInstrumentationDataProxy();
 }
 
+cl_int EmptyCommand::enqueueImp() {
+  waitForPreparedHostEvents();
+  waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
+
+  return CL_SUCCESS;
+}
+
+void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
+                                  const Requirement *Req) {
+  const Requirement &ReqRef = *Req;
+  MRequirements.emplace_back(ReqRef);
+  const Requirement *const StoredReq = &MRequirements.back();
+
+  addDep(DepDesc{DepCmd, StoredReq, AllocaCmd});
+}
+
 void EmptyCommand::emitInstrumentationData() {
 #ifdef XPTI_ENABLE_INSTRUMENTATION
   if (!xptiTraceEnabled())
     return;
   // Create a payload with the command name and an event using this payload to
   // emit a node_create
-  MAddress = MRequirement.MSYCLMemObj;
+  if (MRequirements.empty())
+    return;
+
+  Requirement &Req = *MRequirements.begin();
+
+  MAddress = Req.MSYCLMemObj;
   makeTraceEventProlog(MAddress);
 
   if (MFirstInstance) {
@@ -1283,6 +1385,9 @@ static std::string cgTypeToString(detail::CG::CGTYPE Type) {
   case detail::CG::PREFETCH_USM:
     return "prefetch usm";
     break;
+  case detail::CG::CODEPLAY_HOST_TASK:
+    return "host task";
+    break;
   default:
     return "unknown";
     break;
@@ -1494,9 +1599,8 @@ void DispatchNativeKernel(void *Blob) {
 }
 
 cl_int ExecCGCommand::enqueueImp() {
-  std::vector<EventImplPtr> EventImpls =
-      Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
-
+  waitForPreparedHostEvents();
+  std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
   auto RawEvents = getPiEvents(EventImpls);
 
   RT::PiEvent &Event = MEvent->getHandleRef();
@@ -1520,6 +1624,7 @@ cl_int ExecCGCommand::enqueueImp() {
         Scheduler::getInstance().getDefaultHostQueue(), Req->MDims,
         Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0},
         Req->MElemSize, std::move(RawEvents), Event);
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::COPY_PTR_TO_ACC: {
@@ -1553,6 +1658,7 @@ cl_int ExecCGCommand::enqueueImp() {
         ReqSrc->MOffset, ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(),
         MQueue, ReqDst->MDims, ReqDst->MMemoryRange, ReqDst->MAccessRange,
         ReqDst->MOffset, ReqDst->MElemSize, std::move(RawEvents), Event);
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::FILL: {
@@ -1565,6 +1671,7 @@ cl_int ExecCGCommand::enqueueImp() {
         Fill->MPattern.size(), Fill->MPattern.data(), Req->MDims,
         Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize,
         std::move(RawEvents), Event);
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::RUN_ON_HOST_INTEL: {
@@ -1594,6 +1701,7 @@ cl_int ExecCGCommand::enqueueImp() {
         Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
       }
       DispatchNativeKernel((void *)ArgsBlob.data());
+
       return CL_SUCCESS;
     }
 
@@ -1651,6 +1759,7 @@ cl_int ExecCGCommand::enqueueImp() {
       }
       ExecKernel->MHostKernel->call(NDRDesc,
                                     getEvent()->getHostProfilingInfo());
+
       return CL_SUCCESS;
     }
 
@@ -1732,18 +1841,21 @@ cl_int ExecCGCommand::enqueueImp() {
       return detail::enqueue_kernel_launch::handleError(Error, DeviceImpl,
                                                         Kernel, NDRDesc);
     }
+
     return PI_SUCCESS;
   }
   case CG::CGTYPE::COPY_USM: {
     CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
     MemoryManager::copy_usm(Copy->getSrc(), MQueue, Copy->getLength(),
                             Copy->getDst(), std::move(RawEvents), Event);
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::FILL_USM: {
     CGFillUSM *Fill = (CGFillUSM *)MCommandGroup.get();
     MemoryManager::fill_usm(Fill->getDst(), MQueue, Fill->getLength(),
                             Fill->getFill(), std::move(RawEvents), Event);
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::PREFETCH_USM: {
@@ -1751,9 +1863,10 @@ cl_int ExecCGCommand::enqueueImp() {
     MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
                                 Prefetch->getLength(), std::move(RawEvents),
                                 Event);
+
     return CL_SUCCESS;
   }
-  case CG::CGTYPE::INTEROP_TASK_CODEPLAY: {
+  case CG::CGTYPE::CODEPLAY_INTEROP_TASK: {
     const detail::plugin &Plugin = MQueue->getPlugin();
     CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
     // Wait for dependencies to complete before dispatching work on the host
@@ -1780,6 +1893,31 @@ cl_int ExecCGCommand::enqueueImp() {
                                                 nullptr, &Event);
     Plugin.call<PiApiKind::piQueueRelease>(
         reinterpret_cast<pi_queue>(MQueue->get()));
+
+    return CL_SUCCESS;
+  }
+  case CG::CGTYPE::CODEPLAY_HOST_TASK: {
+    CGHostTask *HostTask = static_cast<CGHostTask *>(MCommandGroup.get());
+
+    for (ArgDesc &Arg : HostTask->MArgs) {
+      switch (Arg.MType) {
+      case kernel_param_kind_t::kind_accessor: {
+        Requirement *Req = static_cast<Requirement *>(Arg.MPtr);
+        AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
+
+        Req->MData = AllocaCmd->getMemAllocation();
+        break;
+      }
+      default:
+        throw runtime_error("Unsupported arg type", PI_INVALID_VALUE);
+      }
+    }
+
+    MQueue->getThreadPool().submit<DispatchHostTask>(
+        std::move(DispatchHostTask(this)));
+
+    MShouldCompleteEventIfPossible = false;
+
     return CL_SUCCESS;
   }
   case CG::CGTYPE::NONE:
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 80feb92ce9560..37880f41d1623 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -10,6 +10,7 @@
 
 #include <atomic>
 #include <cstdint>
+#include <deque>
 #include <memory>
 #include <set>
 #include <unordered_set>
@@ -26,6 +27,7 @@ namespace detail {
 class queue_impl;
 class event_impl;
 class context_impl;
+class DispatchHostTask;
 
 using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
 using EventImplPtr = std::shared_ptr<detail::event_impl>;
@@ -35,6 +37,8 @@ class Command;
 class AllocaCommand;
 class AllocaCommandBase;
 class ReleaseCommand;
+class ExecCGCommand;
+class EmptyCommand;
 
 enum BlockingT { NON_BLOCKING = 0, BLOCKING };
 
@@ -95,7 +99,8 @@ class Command {
     MAP_MEM_OBJ,
     UNMAP_MEM_OBJ,
     UPDATE_REQUIREMENT,
-    EMPTY_TASK
+    EMPTY_TASK,
+    HOST_TASK
   };
 
   Command(CommandType Type, QueueImplPtr Queue);
@@ -166,14 +171,34 @@ class Command {
 
   virtual ~Command() = default;
 
+  const char *getBlockReason() const;
+
+  virtual ContextImplPtr getContext() const;
+
 protected:
   EventImplPtr MEvent;
   QueueImplPtr MQueue;
-  std::vector<EventImplPtr> MDepsEvents;
+
+  /// Dependency events prepared for waiting by backend.
+  /// See processDepEvent for details.
+  std::vector<EventImplPtr> MPreparedDepsEvents;
+  std::vector<EventImplPtr> MPreparedHostDepsEvents;
 
   void waitForEvents(QueueImplPtr Queue, std::vector<EventImplPtr> &RawEvents,
                      RT::PiEvent &Event);
-  std::vector<EventImplPtr> prepareEvents(ContextImplPtr Context);
+
+  void waitForPreparedHostEvents() const;
+
+  /// Perform glueing of events from different contexts
+  /// \param DepEvent event this commands should depend on
+  /// \param Dep optional DepDesc to perform connection of events properly
+  ///
+  /// Glueing (i.e. connecting) will be performed if and only if DepEvent is
+  /// not from host context and its context doesn't match to context of this
+  /// command. Context of this command is fetched via getContext().
+  ///
+  /// Optionality of Dep is set by Dep.MDepCommand not equal to nullptr.
+  void processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep);
 
   /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
@@ -183,6 +208,8 @@ class Command {
   /// Mutex used to protect enqueueing from race conditions
   std::mutex MEnqueueMtx;
 
+  friend class DispatchHostTask;
+
 public:
   /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
@@ -193,7 +220,10 @@ class Command {
   /// Counts the number of memory objects this command is a leaf for.
   unsigned MLeafCounter = 0;
 
-  const char *MBlockReason = "Unknown";
+  enum class BlockReason : int { HostAccessor = 0, HostTask };
+
+  // Only have reasonable value while MIsBlockable is true
+  BlockReason MBlockReason;
 
   /// Describes the status of the command.
   std::atomic<EnqueueResultT::ResultT> MEnqueueStatus;
@@ -224,23 +254,36 @@ class Command {
   bool MFirstInstance = false;
   /// Instance ID tracked for the command.
   uint64_t MInstanceID = 0;
+
+  // This flag allows to control whether host event should be set complete
+  // after successfull enqueue of command. Event is considered as host event if
+  // either it's is_host() return true or there is no backend representation
+  // of event (i.e. getHandleRef() return reference to nullptr value).
+  // By default the flag is set to true due to most of host operations are
+  // synchronous. The only asynchronous operation currently is host-task.
+  bool MShouldCompleteEventIfPossible = true;
 };
 
 /// The empty command does nothing during enqueue. The task can be used to
 /// implement lock in the graph, or to merge several nodes into one.
 class EmptyCommand : public Command {
 public:
-  EmptyCommand(QueueImplPtr Queue, Requirement Req);
+  EmptyCommand(QueueImplPtr Queue);
 
   void printDot(std::ostream &Stream) const final;
-  const Requirement *getRequirement() const final { return &MRequirement; }
+  const Requirement *getRequirement() const final { return &MRequirements[0]; }
+  void addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
+                      const Requirement *Req);
 
   void emitInstrumentationData();
 
 private:
-  cl_int enqueueImp() final { return CL_SUCCESS; }
+  cl_int enqueueImp() final;
 
-  Requirement MRequirement;
+  // Employing deque here as it allows to push_back/emplace_back without
+  // invalidation of pointer or reference to stored data item regardless of
+  // iterator invalidation.
+  std::deque<Requirement> MRequirements;
 };
 
 /// The release command enqueues release of a memory object instance allocated
@@ -380,6 +423,7 @@ class MemCpyCommand : public Command {
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
   void emitInstrumentationData();
+  ContextImplPtr getContext() const override final;
 
 private:
   cl_int enqueueImp() final;
@@ -402,6 +446,7 @@ class MemCpyCommandHost : public Command {
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
   void emitInstrumentationData();
+  ContextImplPtr getContext() const override final;
 
 private:
   cl_int enqueueImp() final;
@@ -424,12 +469,22 @@ class ExecCGCommand : public Command {
   void printDot(std::ostream &Stream) const final;
   void emitInstrumentationData();
 
+  detail::CG &getCG() const { return *MCommandGroup; }
+
+  // MEmptyCmd one is only employed if this command refers to host-task.
+  // MEmptyCmd due to unreliable mechanism of lookup for single EmptyCommand
+  // amongst users of host-task-representing command. This unreliability roots
+  // in cleanup process.
+  EmptyCommand *MEmptyCmd = nullptr;
+
 private:
   cl_int enqueueImp() final;
 
   AllocaCommandBase *getAllocaForReq(Requirement *Req);
 
   std::unique_ptr<detail::CG> MCommandGroup;
+
+  friend class Command;
 };
 
 class UpdateHostRequirementCommand : public Command {
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 4d749d92e9952..684c9069f3ce1 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -129,7 +129,7 @@ MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
 
 MemObjRecord *
 Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
-                                                 Requirement *Req) {
+                                                 const Requirement *Req) {
   SYCLMemObjI *MemObject = Req->MSYCLMemObj;
   MemObjRecord *Record = getMemObjRecord(MemObject);
 
@@ -416,17 +416,8 @@ Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
   Command *UpdateHostAccCmd = insertUpdateHostReqCmd(Record, Req, HostQueue);
 
   // Need empty command to be blocked until host accessor is destructed
-  EmptyCommand *EmptyCmd = new EmptyCommand(HostQueue, *Req);
-  EmptyCmd->addDep(
-      DepDesc{UpdateHostAccCmd, EmptyCmd->getRequirement(), HostAllocaCmd});
-  UpdateHostAccCmd->addUser(EmptyCmd);
-
-  EmptyCmd->MIsBlockable = true;
-  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
-  EmptyCmd->MBlockReason = "A Buffer is locked by the host accessor";
-
-  updateLeaves({UpdateHostAccCmd}, Record, Req->MAccessMode);
-  addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode);
+  EmptyCommand *EmptyCmd = addEmptyCmd<Requirement>(
+      UpdateHostAccCmd, {Req}, HostQueue, Command::BlockReason::HostAccessor);
 
   Req->MBlockedCmd = EmptyCmd;
 
@@ -455,7 +446,8 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
 /// 2. New and examined commands has non-overlapping requirements -> can bypass
 /// 3. New and examined commands have different contexts -> cannot bypass
 std::set<Command *>
-Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
+Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record,
+                                        const Requirement *Req,
                                         const ContextImplPtr &Context) {
   std::set<Command *> RetDeps;
   std::set<Command *> Visited;
@@ -522,8 +514,10 @@ DepDesc Scheduler::GraphBuilder::findDepForRecord(Command *Cmd,
 
 // The function searches for the alloca command matching context and
 // requirement.
-AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
-    MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) {
+AllocaCommandBase *
+Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
+                                          const Requirement *Req,
+                                          const ContextImplPtr &Context) {
   auto IsSuitableAlloca = [&Context, Req](AllocaCommandBase *AllocaCmd) {
     bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context);
     if (IsSuitableSubReq(Req)) {
@@ -544,7 +538,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq(
 // Note, creation of new allocation command can lead to the current context
 // (Record->MCurContext) change.
 AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
-    MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue) {
+    MemObjRecord *Record, const Requirement *Req, QueueImplPtr Queue) {
 
   AllocaCommandBase *AllocaCmd =
       findAllocaForReq(Record, Req, Queue->getContextImplPtr());
@@ -614,6 +608,15 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
         } else {
           LinkedAllocaCmd->MIsActive = false;
           Record->MCurContext = Queue->getContextImplPtr();
+
+          std::set<Command *> Deps =
+              findDepsForReq(Record, Req, Queue->getContextImplPtr());
+          for (Command *Dep : Deps) {
+            AllocaCmd->addDep(DepDesc{Dep, Req, LinkedAllocaCmd});
+            Dep->addUser(AllocaCmd);
+          }
+          updateLeaves(Deps, Record, Req->MAccessMode);
+          addNodeToLeaves(Record, AllocaCmd, Req->MAccessMode);
         }
       }
     }
@@ -640,11 +643,50 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record,
   }
 }
 
+template <typename T>
+typename std::enable_if<
+    std::is_same<typename std::remove_cv<T>::type, Requirement>::value,
+    EmptyCommand *>::type
+Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector<T *> &Reqs,
+                                     const QueueImplPtr &Queue,
+                                     Command::BlockReason Reason) {
+  EmptyCommand *EmptyCmd =
+      new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue());
+
+  if (!EmptyCmd)
+    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
+
+  EmptyCmd->MIsBlockable = true;
+  EmptyCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueBlocked;
+  EmptyCmd->MBlockReason = Reason;
+
+  for (T *Req : Reqs) {
+    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req);
+    AllocaCommandBase *AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue);
+    EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
+  }
+
+  Cmd->addUser(EmptyCmd);
+
+  const std::vector<DepDesc> &Deps = Cmd->MDeps;
+  for (const DepDesc &Dep : Deps) {
+    const Requirement *Req = Dep.MDepRequirement;
+    MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
+
+    updateLeaves({Cmd}, Record, Req->MAccessMode);
+    addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode);
+  }
+
+  return EmptyCmd;
+}
+
 Command *
 Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
                                QueueImplPtr Queue) {
   const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
   const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
+  const CG::CGTYPE CGType = CommandGroup->getType();
+
   std::unique_ptr<ExecCGCommand> NewCmd(
       new ExecCGCommand(std::move(CommandGroup), Queue));
   if (!NewCmd)
@@ -684,6 +726,7 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
   // Set new command as user for dependencies and update leaves.
   // Node dependencies can be modified further when adding the node to leaves,
   // iterate over their copy.
+  // FIXME employ a reference here to eliminate copying of a vector
   std::vector<DepDesc> Deps = NewCmd->MDeps;
   for (DepDesc &Dep : Deps) {
     Dep.MDepCommand->addUser(NewCmd.get());
@@ -698,8 +741,13 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
     NewCmd->addDep(e);
   }
 
+  if (CGType == CG::CGTYPE::CODEPLAY_HOST_TASK)
+    NewCmd->MEmptyCmd = addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements,
+                                    Queue, Command::BlockReason::HostTask);
+
   if (MPrintOptionsArray[AfterAddCG])
     printGraphAsDot("after_addCG");
+
   return NewCmd.release();
 }
 
@@ -725,14 +773,35 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
   // Dependencies of the users will be cleaned up during the traversal
   for (Command *AllocaCmd : AllocaCommands) {
     Visited.insert(AllocaCmd);
+
     for (Command *UserCmd : AllocaCmd->MUsers)
-      ToVisit.push(UserCmd);
+      // Linked alloca cmd may be in users of this alloca. We're not going to
+      // visit it.
+      if (UserCmd->getType() != Command::CommandType::ALLOCA)
+        ToVisit.push(UserCmd);
+      else
+        Visited.insert(UserCmd);
+
     CmdsToDelete.push_back(AllocaCmd);
     // These commands will be deleted later, clear users now to avoid
     // updating them during edge removal
     AllocaCmd->MUsers.clear();
   }
 
+  // Linked alloca's share dependencies. Unchain from deps linked alloca's.
+  // Any cmd of the alloca - linked_alloca may be used later on.
+  for (AllocaCommandBase *AllocaCmd : AllocaCommands) {
+    AllocaCommandBase *LinkedCmd = AllocaCmd->MLinkedAllocaCmd;
+
+    if (LinkedCmd) {
+      assert(Visited.count(LinkedCmd));
+
+      for (DepDesc &Dep : AllocaCmd->MDeps)
+        if (Dep.MDepCommand)
+          Dep.MDepCommand->MUsers.erase(AllocaCmd);
+    }
+  }
+
   // Traverse the graph using BFS
   while (!ToVisit.empty()) {
     Command *Cmd = ToVisit.front();
@@ -742,7 +811,8 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) {
       continue;
 
     for (Command *UserCmd : Cmd->MUsers)
-      ToVisit.push(UserCmd);
+      if (UserCmd->getType() != Command::CommandType::ALLOCA)
+        ToVisit.push(UserCmd);
 
     // Delete all dependencies on any allocations being removed
     // Track which commands should have their users updated
@@ -820,6 +890,7 @@ void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) {
       DepCmd->MUsers.erase(Cmd);
     }
     Cmd->getEvent()->setCommand(nullptr);
+
     delete Cmd;
   }
 }
@@ -833,6 +904,104 @@ void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) {
   MemObject->MRecord.reset();
 }
 
+// Make Cmd depend on DepEvent from different context. Connection is performed
+// via distinct ConnectCmd with host task command group on host queue. Cmd will
+// depend on ConnectCmd's host event.
+// DepEvent may not have a command associated with it in at least two cases:
+//  - the command was deleted upon cleanup process;
+//  - DepEvent is user event.
+// In both of these cases the only thing we can do is to make ConnectCmd depend
+// on DepEvent.
+// Otherwise, when there is a command associated with DepEvent, we make
+// ConnectCmd depend on on this command. If there is valid, i.e. non-nil,
+// requirement in Dep we make ConnectCmd depend on DepEvent's command with this
+// requirement.
+// Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
+void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd,
+                                              EventImplPtr DepEvent,
+                                              const DepDesc &Dep) {
+  const ContextImplPtr &Context = Cmd->getContext();
+  const ContextImplPtr &DepEventContext = DepEvent->getContextImpl();
+
+  assert(Context != DepEventContext);
+
+  // construct Host Task type command manually and make it depend on DepEvent
+  ExecCGCommand *ConnectCmd = nullptr;
+
+  {
+    std::unique_ptr<detail::HostTask> HT(new detail::HostTask);
+    std::unique_ptr<detail::CG> ConnectCG(new detail::CGHostTask(
+        std::move(HT), /* Args = */ {}, /* ArgsStorage = */ {},
+        /* AccStorage = */ {}, /* SharedPtrStorage = */ {},
+        /* Requirements = */ {}, /* DepEvents = */ {DepEvent},
+        CG::CODEPLAY_HOST_TASK, /* Payload */ {}));
+    ConnectCmd = new ExecCGCommand(
+        std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue());
+  }
+
+  if (!ConnectCmd)
+    throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
+
+  if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
+    DepCmd->addUser(ConnectCmd);
+
+  EmptyCommand *EmptyCmd = nullptr;
+
+  if (Dep.MDepRequirement) {
+    // make ConnectCmd depend on requirement
+    ConnectCmd->addDep(Dep);
+    assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
+           Dep.MDepCommand);
+    // add user to Dep.MDepCommand is already performed beyond this if branch
+
+    MemObjRecord *Record = getMemObjRecord(Dep.MDepRequirement->MSYCLMemObj);
+
+    updateLeaves({Dep.MDepCommand}, Record, Dep.MDepRequirement->MAccessMode);
+    addNodeToLeaves(Record, ConnectCmd, Dep.MDepRequirement->MAccessMode);
+
+    const std::vector<const Requirement *> Reqs(1, Dep.MDepRequirement);
+    EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
+                           Scheduler::getInstance().getDefaultHostQueue(),
+                           Command::BlockReason::HostTask);
+    // Dependencies for EmptyCmd are set in addEmptyCmd for provided Reqs.
+
+    // Depend Cmd on empty command
+    {
+      DepDesc CmdDep = Dep;
+      CmdDep.MDepCommand = EmptyCmd;
+
+      Cmd->addDep(CmdDep);
+    }
+  } else {
+    EmptyCmd = addEmptyCmd<Requirement>(
+        ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
+        Command::BlockReason::HostTask);
+
+    // There is no requirement thus, empty command will only depend on
+    // ConnectCmd via its event.
+    EmptyCmd->addDep(ConnectCmd->getEvent());
+    ConnectCmd->addDep(DepEvent);
+
+    // Depend Cmd on empty command
+    Cmd->addDep(EmptyCmd->getEvent());
+  }
+
+  EmptyCmd->addUser(Cmd);
+
+  ConnectCmd->MEmptyCmd = EmptyCmd;
+
+  // FIXME graph builder shouldn't really enqueue commands. We're in the middle
+  // of enqueue process for some command Cmd. We're going to add a dependency
+  // for it. Need some nice and cute solution to enqueue ConnectCmd via standard
+  // scheduler/graph processor mechanisms.
+  // Though, we need this call to enqueue to launch ConnectCmd.
+  EnqueueResultT Res;
+  bool Enqueued = Scheduler::GraphProcessor::enqueueCommand(ConnectCmd, Res);
+  if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
+    throw runtime_error("Failed to enqueue a sync event between two contexts",
+                        PI_INVALID_OPERATION);
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
index 08c8a67431376..aa72388a70a12 100644
--- a/sycl/source/detail/scheduler/graph_processor.cpp
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -45,11 +45,7 @@ void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event) {
     // TODO: Reschedule commands.
     throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
 
-  RT::PiEvent &CLEvent = Cmd->getEvent()->getHandleRef();
-  if (CLEvent) {
-    const detail::plugin &Plugin = Event->getPlugin();
-    Plugin.call<PiApiKind::piEventsWait>(1, &CLEvent);
-  }
+  Cmd->getEvent()->waitInternal();
 }
 
 bool Scheduler::GraphProcessor::enqueueCommand(Command *Cmd,
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 9c16eafb26ace..2cac4499e16d8 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -73,9 +73,16 @@ EventImplPtr Scheduler::addCG(std::unique_ptr<detail::CG> CommandGroup,
       NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup),
                                              DefaultHostQueue);
       break;
+    case CG::CODEPLAY_HOST_TASK:
+      NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue);
+      break;
     default:
       NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue));
     }
+  }
+
+  {
+    std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
 
     // TODO: Check if lazy mode.
     EnqueueResultT Res;
@@ -153,6 +160,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {
   if (!Record)
     // No operations were performed on the mem object
     return;
+
   waitForRecordToFinish(Record);
   MGraphBuilder.decrementLeafCountersForRecord(Record);
   MGraphBuilder.cleanupCommandsForRecord(Record);
@@ -175,8 +183,19 @@ EventImplPtr Scheduler::addHostAccessor(Requirement *Req,
 }
 
 void Scheduler::releaseHostAccessor(Requirement *Req) {
-  Req->MBlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
+  Command *const BlockedCmd = Req->MBlockedCmd;
+
   std::shared_lock<std::shared_timed_mutex> Lock(MGraphLock);
+
+  assert(BlockedCmd && "Can't find appropriate command to unblock");
+
+  BlockedCmd->MEnqueueStatus = EnqueueResultT::SyclEnqueueReady;
+
+  enqueueLeavesOfReqUnlocked(Req);
+}
+
+// static
+void Scheduler::enqueueLeavesOfReqUnlocked(const Requirement *const Req) {
   MemObjRecord *Record = Req->MSYCLMemObj->MRecord.get();
   auto EnqueueLeaves = [](CircularBuffer<Command *> &Leaves) {
     for (Command *Cmd : Leaves) {
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index a86755a8406e8..b9bb2e8ef02b9 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -17,6 +17,7 @@
 #include <memory>
 #include <set>
 #include <shared_mutex>
+#include <unordered_set>
 #include <vector>
 
 /// \defgroup sycl_graph DPC++ Execution Graph
@@ -172,6 +173,7 @@ namespace detail {
 class queue_impl;
 class event_impl;
 class context_impl;
+class DispatchHostTask;
 
 using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
 using EventImplPtr = std::shared_ptr<detail::event_impl>;
@@ -428,6 +430,8 @@ class Scheduler {
   Scheduler();
   static Scheduler instance;
 
+  static void enqueueLeavesOfReqUnlocked(const Requirement *const Req);
+
   /// Graph builder class.
   ///
   /// The graph builder provides means to change an existing graph (e.g. add
@@ -489,7 +493,7 @@ class Scheduler {
     /// \return a pointer to MemObjRecord for pointer to memory object. If the
     /// record is not found, nullptr is returned.
     MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
-                                          Requirement *Req);
+                                          const Requirement *Req);
 
     /// Decrements leaf counters for all leaves of the record.
     void decrementLeafCountersForRecord(MemObjRecord *Record);
@@ -508,6 +512,15 @@ class Scheduler {
     void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
                       access::mode AccessMode);
 
+    /// Perform connection of events in multiple contexts
+    /// \param Cmd dependant command
+    /// \param DepEvent event to depend on
+    /// \param Dep optional DepDesc to perform connection properly
+    ///
+    /// Optionality of Dep is set by Dep.MDepCommand equal to nullptr.
+    void connectDepEvent(Command *const Cmd, EventImplPtr DepEvent,
+                         const DepDesc &Dep);
+
     std::vector<SYCLMemObjI *> MMemObjs;
 
   private:
@@ -533,21 +546,34 @@ class Scheduler {
                            const QueueImplPtr &Queue);
 
     /// Finds dependencies for the requirement.
-    std::set<Command *> findDepsForReq(MemObjRecord *Record, Requirement *Req,
+    std::set<Command *> findDepsForReq(MemObjRecord *Record,
+                                       const Requirement *Req,
                                        const ContextImplPtr &Context);
 
+    template <typename T>
+    typename std::enable_if<
+        std::is_same<typename std::remove_cv<T>::type, Requirement>::value,
+        EmptyCommand *>::type
+    addEmptyCmd(Command *Cmd, const std::vector<T *> &Req,
+                const QueueImplPtr &Queue, Command::BlockReason Reason);
+
+  protected:
     /// Finds a command dependency corresponding to the record.
     DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record);
 
     /// Searches for suitable alloca in memory record.
-    AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req,
+    AllocaCommandBase *findAllocaForReq(MemObjRecord *Record,
+                                        const Requirement *Req,
                                         const ContextImplPtr &Context);
 
+    friend class Command;
+
+  private:
     /// Searches for suitable alloca in memory record.
     ///
     /// If none found, creates new one.
     AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record,
-                                               Requirement *Req,
+                                               const Requirement *Req,
                                                QueueImplPtr Queue);
 
     void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);
@@ -664,6 +690,9 @@ class Scheduler {
   std::shared_timed_mutex MGraphLock;
 
   QueueImplPtr DefaultHostQueue;
+
+  friend class Command;
+  friend class DispatchHostTask;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/thread_pool.hpp b/sycl/source/detail/thread_pool.hpp
new file mode 100644
index 0000000000000..2d7873748bcc9
--- /dev/null
+++ b/sycl/source/detail/thread_pool.hpp
@@ -0,0 +1,99 @@
+//===-- thread_pool.hpp - Simple thread pool --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include <CL/sycl/detail/defines.hpp>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+
+class ThreadPool {
+  std::vector<std::thread> MLaunchedThreads;
+
+  size_t MThreadCount;
+  std::queue<std::function<void()>> MJobQueue;
+  std::mutex MJobQueueMutex;
+  std::condition_variable MDoSmthOrStop;
+  std::atomic_bool MStop;
+
+  void worker() {
+    std::unique_lock<std::mutex> Lock(MJobQueueMutex);
+
+    while (true) {
+      MDoSmthOrStop.wait(
+          Lock, [this]() { return !MJobQueue.empty() || MStop.load(); });
+
+      if (MStop.load())
+        break;
+
+      std::function<void()> Job = std::move(MJobQueue.front());
+      MJobQueue.pop();
+      Lock.unlock();
+
+      Job();
+
+      Lock.lock();
+    }
+  }
+
+public:
+  ThreadPool(unsigned int ThreadCount = 1) : MThreadCount(ThreadCount) {}
+
+  ~ThreadPool() { finishAndWait(); }
+
+  void start() {
+    MLaunchedThreads.reserve(MThreadCount);
+
+    MStop.store(false);
+
+    for (size_t Idx = 0; Idx < MThreadCount; ++Idx)
+      MLaunchedThreads.emplace_back([this] { worker(); });
+  }
+
+  void finishAndWait() {
+    MStop.store(true);
+
+    MDoSmthOrStop.notify_all();
+
+    for (std::thread &Thread : MLaunchedThreads)
+      if (Thread.joinable())
+        Thread.join();
+  }
+
+  template <typename T> void submit(T &&Func) {
+    {
+      std::lock_guard<std::mutex> Lock(MJobQueueMutex);
+      MJobQueue.emplace([F = std::move(Func)]() { F(); });
+    }
+
+    MDoSmthOrStop.notify_one();
+  }
+
+  void submit(std::function<void()> &&Func) {
+    {
+      std::lock_guard<std::mutex> Lock(MJobQueueMutex);
+      MJobQueue.emplace(Func);
+    }
+
+    MDoSmthOrStop.notify_one();
+  }
+};
+
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 6d018f6d3b122..195cc9c1fc072 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -39,7 +39,7 @@ event handler::finalize() {
         MCodeLoc));
     break;
   }
-  case detail::CG::INTEROP_TASK_CODEPLAY:
+  case detail::CG::CODEPLAY_INTEROP_TASK:
     CommandGroup.reset(new detail::CGInteropTask(
         std::move(MInteropTask), std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
@@ -83,6 +83,12 @@ event handler::finalize() {
         std::move(MSharedPtrStorage), std::move(MRequirements),
         std::move(MEvents), MCodeLoc));
     break;
+  case detail::CG::CODEPLAY_HOST_TASK:
+    CommandGroup.reset(new detail::CGHostTask(
+        std::move(MHostTask), std::move(MArgs), std::move(MArgsStorage),
+        std::move(MAccStorage), std::move(MSharedPtrStorage),
+        std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
+    break;
   case detail::CG::NONE:
     throw runtime_error("Command group submitted without a kernel or a "
                         "explicit memory operation.",
diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index bdb730b2c2152..ffbbbe6e38be3 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -10,6 +10,9 @@ set(LLVM_BUILD_LIBRARY_DIRS "${LLVM_BINARY_DIR}/lib/")
 set(RT_TEST_ARGS ${RT_TEST_ARGS} "-v")
 set(DEPLOY_RT_TEST_ARGS ${DEPLOY_RT_TEST_ARGS} "-v -D SYCL_TOOLS_DIR=${CMAKE_INSTALL_PREFIX}/bin -D SYCL_LIBS_DIR=${CMAKE_INSTALL_PREFIX}/lib${LLVM_LIBDIR_SUFFIX} -D SYCL_INCLUDE=${SYCL_INCLUDE_DEPLOY_DIR}")
 
+find_package(Threads REQUIRED)
+set(SYCL_THREADS_LIB ${CMAKE_THREAD_LIBS_INIT})
+
 configure_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in
   ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py
diff --git a/sycl/test/abi/layout_handler.cpp b/sycl/test/abi/layout_handler.cpp
index 49887124183a4..65ad8670db1fc 100644
--- a/sycl/test/abi/layout_handler.cpp
+++ b/sycl/test/abi/layout_handler.cpp
@@ -31,6 +31,7 @@
 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0
 // CHECK-NEXT: FieldDecl {{.*}} MPattern 'vector_class<char>':'std::vector<char, std::allocator<char>>'
 // CHECK-NEXT: FieldDecl {{.*}} MHostKernel 'unique_ptr_class<detail::HostKernelBase>':'std::unique_ptr<cl::sycl::detail::HostKernelBase, std::default_delete<cl::sycl::detail::HostKernelBase>>'
+// CHECK-NEXT: FieldDecl {{.*}} MHostTask 'unique_ptr_class<detail::HostTask>':'std::unique_ptr<cl::sycl::detail::HostTask, std::default_delete<cl::sycl::detail::HostTask>>'
 // CHECK-NEXT: FieldDecl {{.*}} MOSModuleHandle 'detail::OSModuleHandle':'long'
 // CHECK-NEXT: FieldDecl {{.*}} MInteropTask 'std::unique_ptr<detail::InteropTask>':'std::unique_ptr<cl::sycl::detail::InteropTask, std::default_delete<cl::sycl::detail::InteropTask>>'
 // CHECK-NEXT: FieldDecl {{.*}} MEvents 'vector_class<detail::EventImplPtr>':'std::vector<std::shared_ptr<cl::sycl::detail::event_impl>, std::allocator<std::shared_ptr<cl::sycl::detail::event_impl>>>'
diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp
index 5b595a86978ea..9380fd1526de1 100644
--- a/sycl/test/abi/symbol_size.cpp
+++ b/sycl/test/abi/symbol_size.cpp
@@ -44,9 +44,9 @@ int main() {
   check_size<event, 16>();
   check_size<gpu_selector, 8>();
 #ifdef _MSC_VER
-  check_size<handler, 520>();
-#else
   check_size<handler, 528>();
+#else
+  check_size<handler, 536>();
 #endif
   check_size<image<1>, 16>();
   check_size<kernel, 16>();
diff --git a/sycl/test/host-interop-task/host-task-dependency.cpp b/sycl/test/host-interop-task/host-task-dependency.cpp
new file mode 100644
index 0000000000000..942b79165aebd
--- /dev/null
+++ b/sycl/test/host-interop-task/host-task-dependency.cpp
@@ -0,0 +1,196 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib
+// RUN: %CPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %CPU_CHECK_PLACEHOLDER
+// RUN: %GPU_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: %ACC_RUN_PLACEHOLDER SYCL_PI_TRACE=-1 %t.out 2>&1 %ACC_CHECK_PLACEHOLDER
+
+#include <atomic>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <thread>
+
+#include <CL/sycl.hpp>
+
+namespace S = cl::sycl;
+
+struct Context {
+  std::atomic_bool Flag;
+  S::queue &Queue;
+  S::buffer<int, 1> Buf1;
+  S::buffer<int, 1> Buf2;
+  S::buffer<int, 1> Buf3;
+  std::mutex Mutex;
+  std::condition_variable CV;
+};
+
+void Thread1Fn(Context *Ctx) {
+  // 0. initialize resulting buffer with apriori wrong result
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf1);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -1;
+  }
+
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf2);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -2;
+  }
+
+  {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf3);
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx)
+      Acc[Idx] = -3;
+  }
+
+  // 1. submit task writing to buffer 1
+  Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::global_buffer>
+        GeneratorAcc(Ctx->Buf1, CGH);
+
+    auto GeneratorKernel = [GeneratorAcc] {
+      for (size_t Idx = 0; Idx < GeneratorAcc.get_count(); ++Idx)
+        GeneratorAcc[Idx] = Idx;
+    };
+
+    CGH.single_task<class GeneratorTask>(GeneratorKernel);
+  });
+
+  // 2. submit host task writing from buf 1 to buf 2
+  auto HostTaskEvent = Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        CopierSrcAcc(Ctx->Buf1, CGH);
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::host_buffer>
+        CopierDstAcc(Ctx->Buf2, CGH);
+
+    auto CopierHostTask = [CopierSrcAcc, CopierDstAcc, &Ctx] {
+      for (size_t Idx = 0; Idx < CopierDstAcc.get_count(); ++Idx)
+        CopierDstAcc[Idx] = CopierSrcAcc[Idx];
+
+      bool Expected = false;
+      bool Desired = true;
+      assert(Ctx->Flag.compare_exchange_strong(Expected, Desired));
+
+      {
+        std::lock_guard<std::mutex> Lock(Ctx->Mutex);
+        Ctx->CV.notify_all();
+      }
+    };
+
+    CGH.codeplay_host_task(CopierHostTask);
+  });
+
+  // 3. submit simple task to move data between two buffers
+  Ctx->Queue.submit([&](S::handler &CGH) {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::global_buffer>
+        SrcAcc(Ctx->Buf2, CGH);
+    S::accessor<int, 1, S::access::mode::write,
+                S::access::target::global_buffer>
+        DstAcc(Ctx->Buf3, CGH);
+
+    CGH.depends_on(HostTaskEvent);
+
+    auto CopierKernel = [SrcAcc, DstAcc] {
+      for (size_t Idx = 0; Idx < DstAcc.get_count(); ++Idx)
+        DstAcc[Idx] = SrcAcc[Idx];
+    };
+
+    CGH.single_task<class CopierTask>(CopierKernel);
+  });
+
+  // 4. check data in buffer #3
+  {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        Acc(Ctx->Buf3);
+
+    bool Failure = false;
+
+    for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) {
+      fprintf(stderr, "Third buffer [%3zu] = %i\n", Idx, Acc[Idx]);
+
+      Failure |= (Acc[Idx] != Idx);
+    }
+
+    assert(!Failure && "Invalid data in third buffer");
+  }
+}
+
+void Thread2Fn(Context *Ctx) {
+  std::unique_lock<std::mutex> Lock(Ctx->Mutex);
+
+  // T2.1. Wait until flag F is set eq true.
+  Ctx->CV.wait(Lock, [Ctx] { return Ctx->Flag.load(); });
+
+  assert(Ctx->Flag.load());
+}
+
+void test() {
+  auto EH = [](S::exception_list EL) {
+    for (const std::exception_ptr &E : EL) {
+      throw E;
+    }
+  };
+
+  S::queue Queue(EH);
+
+  Context Ctx{{false}, Queue, {10}, {10}, {10}, {}, {}};
+
+  // 0. setup: thread 1 T1: exec smth; thread 2 T2: waits; init flag F = false
+  auto A1 = std::async(std::launch::async, Thread1Fn, &Ctx);
+  auto A2 = std::async(std::launch::async, Thread2Fn, &Ctx);
+
+  A1.get();
+  A2.get();
+
+  assert(Ctx.Flag.load());
+
+  // 3. check via host accessor that buf 2 contains valid data
+  {
+    S::accessor<int, 1, S::access::mode::read,
+                S::access::target::host_buffer>
+        ResultAcc(Ctx.Buf2);
+
+    bool Failure = false;
+    for (size_t Idx = 0; Idx < ResultAcc.get_count(); ++Idx) {
+      fprintf(stderr, "Second buffer [%3zu] = %i\n", Idx, ResultAcc[Idx]);
+
+      Failure |= (ResultAcc[Idx] != Idx);
+    }
+
+    assert(!Failure && "Invalid data in result buffer");
+  }
+}
+
+int main() {
+  test();
+
+  return 0;
+}
+
+// launch of GeneratorTask kernel
+// CHECK:---> piKernelCreate(
+// CHECK: GeneratorTask
+// CHECK:---> piEnqueueKernelLaunch(
+// prepare for host task
+// CHECK:---> piEnqueueMemBufferMap(
+// launch of CopierTask kernel
+// CHECK:---> piKernelCreate(
+// CHECK: CopierTask
+// CHECK:---> piEnqueueKernelLaunch(
+// TODO need to check for piEventsWait as "wait on dependencies of host task".
+// At the same time this piEventsWait may occur anywhere after
+// piEnqueueMemBufferMap ("prepare for host task").
diff --git a/sycl/test/host-interop-task/host-task-two-queues.cpp b/sycl/test/host-interop-task/host-task-two-queues.cpp
new file mode 100644
index 0000000000000..55134149be36b
--- /dev/null
+++ b/sycl/test/host-interop-task/host-task-two-queues.cpp
@@ -0,0 +1,78 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <vector>
+
+namespace S = cl::sycl;
+
+#define WIDTH 5
+#define HEIGHT 5
+
+void test() {
+  auto EH = [](S::exception_list EL) {
+    for (const std::exception_ptr &E : EL) {
+      throw E;
+    }
+  };
+
+  S::queue Q1(EH);
+  S::queue Q2(EH);
+
+  std::vector<int> DataA(WIDTH * HEIGHT, 2);
+  std::vector<int> DataB(WIDTH * HEIGHT, 3);
+  std::vector<int> DataC(WIDTH * HEIGHT, 1);
+
+  S::buffer<int, 2> BufA{DataA.data(), S::range<2>{WIDTH, HEIGHT}};
+  S::buffer<int, 2> BufB{DataB.data(), S::range<2>{WIDTH, HEIGHT}};
+  S::buffer<int, 2> BufC{DataC.data(), S::range<2>{WIDTH, HEIGHT}};
+
+  auto CG1 = [&](S::handler &CGH) {
+    auto AccA = BufA.get_access<S::access::mode::read>(CGH);
+    auto AccB = BufB.get_access<S::access::mode::read>(CGH);
+    auto AccC = BufC.get_access<S::access::mode::read_write>(CGH);
+    auto Kernel = [=](S::nd_item<2> Item) {
+      size_t W = Item.get_global_id(0);
+      size_t H = Item.get_global_id(1);
+      AccC[W][H] += AccA[W][H] * AccB[W][H];
+    };
+    CGH.parallel_for<class K1>(S::nd_range<2>({WIDTH, HEIGHT}, {1, 1}), Kernel);
+  };
+
+  auto CG2 = [&](S::handler &CGH) {
+    auto AccA = BufA.get_access<sycl::access::mode::read>(CGH);
+    auto AccB = BufB.get_access<sycl::access::mode::read>(CGH);
+    auto AccC = BufC.get_access<sycl::access::mode::read_write>(CGH);
+
+    CGH.codeplay_host_task([=] {
+      for (size_t I = 0; I < WIDTH; ++I)
+        for (size_t J = 0; J < HEIGHT; ++J) {
+          std::cout << "C[" << I << "][" << J << "] = " << AccC[I][J]
+                    << std::endl;
+        }
+    });
+  };
+
+  static const size_t NTIMES = 100;
+
+  for (size_t Idx = 0; Idx < NTIMES; ++Idx) {
+    Q1.submit(CG1);
+    Q2.submit(CG2);
+    Q2.submit(CG1);
+    Q1.submit(CG2);
+  }
+
+  Q1.wait_and_throw();
+  Q2.wait_and_throw();
+
+  for (size_t I = 0; I < WIDTH; ++I)
+    for (size_t J = 0; J < HEIGHT; ++J)
+      assert(DataC[I * HEIGHT + J] == (1 + 2 * 3 * NTIMES * 2));
+}
+
+int main(void) {
+  test();
+  return 0;
+}
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index c63efd551473b..4baaeac0f1dd3 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -58,6 +58,7 @@
 
 llvm_config.with_environment('PATH', config.sycl_tools_dir, append_path=True)
 
+config.substitutions.append( ('%threads_lib', config.sycl_threads_lib) )
 config.substitutions.append( ('%sycl_libs_dir',  config.sycl_libs_dir ) )
 config.substitutions.append( ('%sycl_include',  config.sycl_include ) )
 config.substitutions.append( ('%sycl_source_dir', config.sycl_source_dir) )
diff --git a/sycl/test/lit.site.cfg.py.in b/sycl/test/lit.site.cfg.py.in
index 82281dd5cb654..a8d456eddd3b5 100644
--- a/sycl/test/lit.site.cfg.py.in
+++ b/sycl/test/lit.site.cfg.py.in
@@ -20,6 +20,7 @@ config.llvm_build_bin_dir = "@LLVM_BUILD_BINARY_DIRS@"
 
 config.llvm_enable_projects = "@LLVM_ENABLE_PROJECTS@"
 
+config.sycl_threads_lib = '@SYCL_THREADS_LIB@'
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)