diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index 6bdde8795fac0..f1e6317508123 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -305,9 +305,16 @@ class CGInteropTask : public CG { class CGHostTask : public CG { public: std::unique_ptr MHostTask; + // queue for host-interop task + shared_ptr_class MQueue; + // context for host-interop task + shared_ptr_class MContext; vector_class MArgs; - CGHostTask(std::unique_ptr HostTask, vector_class Args, + CGHostTask(std::unique_ptr HostTask, + std::shared_ptr Queue, + std::shared_ptr Context, + vector_class Args, std::vector> ArgsStorage, std::vector AccStorage, std::vector> SharedPtrStorage, @@ -317,7 +324,8 @@ class CGHostTask : public CG { : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), std::move(Events), std::move(loc)), - MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {} + MHostTask(std::move(HostTask)), MQueue(Queue), MContext(Context), + MArgs(std::move(Args)) {} }; class CGBarrier : public CG { diff --git a/sycl/include/CL/sycl/detail/cg_types.hpp b/sycl/include/CL/sycl/detail/cg_types.hpp index f4df3eeb783e6..b104d8c5c0d63 100644 --- a/sycl/include/CL/sycl/detail/cg_types.hpp +++ b/sycl/include/CL/sycl/detail/cg_types.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -143,12 +144,17 @@ class InteropTask { class HostTask { std::function MHostTask; + std::function MInteropTask; public: HostTask() : MHostTask([]() {}) {} HostTask(std::function &&Func) : MHostTask(Func) {} + HostTask(std::function &&Func) : MInteropTask(Func) {} + + bool isInteropTask() const { return !!MInteropTask; } void call() { MHostTask(); } + void call(interop_handle handle) { MInteropTask(handle); } }; // Class which stores specific lambda object. 
diff --git a/sycl/include/CL/sycl/detail/sycl_mem_obj_i.hpp b/sycl/include/CL/sycl/detail/sycl_mem_obj_i.hpp index 1050d75e22c88..ced079610409c 100644 --- a/sycl/include/CL/sycl/detail/sycl_mem_obj_i.hpp +++ b/sycl/include/CL/sycl/detail/sycl_mem_obj_i.hpp @@ -69,6 +69,7 @@ class SYCLMemObjI { // which is unavailable. shared_ptr_class MRecord; friend class Scheduler; + friend class ExecCGCommand; }; } // namespace detail diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 5ff72f711f0d2..43642b75a3497 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -856,8 +857,22 @@ class __SYCL_EXPORT handler { } template - typename std::enable_if::type, void()>::value>::type + detail::enable_if_t, void()>::value> + codeplay_host_task(FuncT Func) { + throwIfActionIsCreated(); + + MNDRDesc.set(range<1>(1)); + MArgs = std::move(MAssociatedAccesors); + + MHostTask.reset(new detail::HostTask(std::move(Func))); + + MCGType = detail::CG::CODEPLAY_HOST_TASK; + } + + template + detail::enable_if_t, void(interop_handle)>::value> codeplay_host_task(FuncT Func) { throwIfActionIsCreated(); diff --git a/sycl/include/CL/sycl/interop_handle.hpp b/sycl/include/CL/sycl/interop_handle.hpp index 3296ab783bebf..a5776c26c0be7 100644 --- a/sycl/include/CL/sycl/interop_handle.hpp +++ b/sycl/include/CL/sycl/interop_handle.hpp @@ -84,8 +84,13 @@ class interop_handle { template auto get_native_queue() const noexcept -> typename interop::type { +#ifndef __SYCL_DEVICE_ONLY__ return reinterpret_cast::type>( getNativeQueue()); +#else + // we believe this won't be ever called on device side + return nullptr; +#endif } /// Returns an underlying OpenCL device associated with the SYCL queue used @@ -94,8 +99,13 @@ class interop_handle { template auto get_native_device() const noexcept -> typename interop::type { +#ifndef __SYCL_DEVICE_ONLY__ return 
reinterpret_cast::type>( getNativeDevice()); +#else + // we believe this won't be ever called on device side + return nullptr; +#endif } /// Returns an underlying OpenCL context associated with the SYCL queue used @@ -104,14 +114,20 @@ class interop_handle { template auto get_native_context() const noexcept -> typename interop::type { +#ifndef __SYCL_DEVICE_ONLY__ return reinterpret_cast::type>( getNativeContext()); +#else + // we believe this won't be ever called on device side + return nullptr; +#endif } private: + friend class detail::ExecCGCommand; + friend class detail::DispatchHostTask; using ReqToMem = std::pair; -public: // TODO set c-tor private interop_handle(std::vector MemObjs, const std::shared_ptr &Queue, @@ -131,10 +147,10 @@ class interop_handle { getNativeMem(Req)); } - pi_native_handle getNativeMem(detail::Requirement *Req) const; - pi_native_handle getNativeQueue() const; - pi_native_handle getNativeDevice() const; - pi_native_handle getNativeContext() const; + __SYCL_EXPORT pi_native_handle getNativeMem(detail::Requirement *Req) const; + __SYCL_EXPORT pi_native_handle getNativeQueue() const; + __SYCL_EXPORT pi_native_handle getNativeDevice() const; + __SYCL_EXPORT pi_native_handle getNativeContext() const; std::shared_ptr MQueue; std::shared_ptr MDevice; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index c1b8063d6a3c7..81870a3ee6afa 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -8,8 +8,9 @@ #include -#include "CL/sycl/access/access.hpp" +#include #include +#include #include #include #include @@ -159,6 +160,7 @@ getPiEvents(const std::vector &EventImpls) { class DispatchHostTask { ExecCGCommand *MThisCmd; + std::vector MReqToMem; void waitForEvents() const { std::map> @@ -187,7 +189,9 @@ class DispatchHostTask { } public: - DispatchHostTask(ExecCGCommand *ThisCmd) : MThisCmd{ThisCmd} {} + DispatchHostTask(ExecCGCommand *ThisCmd, + 
std::vector ReqToMem) + : MThisCmd{ThisCmd}, MReqToMem(std::move(ReqToMem)) {} void operator()() const { waitForEvents(); @@ -197,7 +201,15 @@ class DispatchHostTask { CGHostTask &HostTask = static_cast(MThisCmd->getCG()); // we're ready to call the user-defined lambda now - HostTask.MHostTask->call(); + if (HostTask.MHostTask->isInteropTask()) { + interop_handle IH{MReqToMem, HostTask.MQueue, + getSyclObjImpl(HostTask.MQueue->get_device()), + HostTask.MQueue->getContextImplPtr()}; + + HostTask.MHostTask->call(IH); + } else + HostTask.MHostTask->call(); + HostTask.MHostTask.reset(); // unblock user empty command here @@ -1943,7 +1955,38 @@ cl_int ExecCGCommand::enqueueImp() { } } - MQueue->getThreadPool().submit(DispatchHostTask(this)); + std::vector ReqToMem; + + if (HostTask->MHostTask->isInteropTask()) { + // Extract the Mem Objects for all Requirements, to ensure they are + // available if a user asks for them inside the interop task scope + const std::vector &HandlerReq = HostTask->MRequirements; + auto ReqToMemConv = [&ReqToMem, HostTask](Requirement *Req) { + const std::vector &AllocaCmds = + Req->MSYCLMemObj->MRecord->MAllocaCommands; + + for (AllocaCommandBase *AllocaCmd : AllocaCmds) + if (HostTask->MQueue == AllocaCmd->getQueue()) { + auto MemArg = + reinterpret_cast(AllocaCmd->getMemAllocation()); + ReqToMem.emplace_back(std::make_pair(Req, MemArg)); + + return; + } + + assert(false && + "Can't get memory object due to no allocation available"); + + throw runtime_error( + "Can't get memory object due to no allocation available", + PI_INVALID_MEM_OBJECT); + }; + std::for_each(std::begin(HandlerReq), std::end(HandlerReq), ReqToMemConv); + std::sort(std::begin(ReqToMem), std::end(ReqToMem)); + } + + MQueue->getThreadPool().submit( + DispatchHostTask(this, std::move(ReqToMem))); MShouldCompleteEventIfPossible = false; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 
cd588cf3c3742..4931c4a41731a 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -679,6 +679,16 @@ Scheduler::GraphBuilder::addEmptyCmd(Command *Cmd, const std::vector &Reqs, return EmptyCmd; } +static bool isInteropHostTask(const std::unique_ptr &Cmd) { + if (Cmd->getCG().getType() != CG::CGTYPE::CODEPLAY_HOST_TASK) + return false; + + const detail::CGHostTask &HT = + static_cast(Cmd->getCG()); + + return HT.MHostTask->isInteropTask(); +} + Command * Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue) { @@ -695,13 +705,29 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, printGraphAsDot("before_addCG"); for (Requirement *Req : Reqs) { - MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req); - markModifiedIfWrite(Record, Req); + MemObjRecord *Record = nullptr; + AllocaCommandBase *AllocaCmd = nullptr; + + bool isSameCtx = false; + + { + const QueueImplPtr &QueueForAlloca = + isInteropHostTask(NewCmd) + ? static_cast(NewCmd->getCG()).MQueue + : Queue; + + Record = getOrInsertMemObjRecord(QueueForAlloca, Req); + markModifiedIfWrite(Record, Req); + + AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca); + + isSameCtx = + sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + } - AllocaCommandBase *AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue); // If there is alloca command we need to check if the latest memory is in // required context. - if (sameCtx(Queue->getContextImplPtr(), Record->MCurContext)) { + if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. if (Record->MCurContext->is_host() && @@ -710,10 +736,24 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, } else { // Cannot directly copy memory from OpenCL device to OpenCL device - // create two copies: device->host and host->device. 
- if (!Queue->is_host() && !Record->MCurContext->is_host()) + bool NeedMemMoveToHost = false; + auto MemMoveTargetQueue = Queue; + + if (isInteropHostTask(NewCmd)) { + const detail::CGHostTask &HT = + static_cast(NewCmd->getCG()); + + if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + NeedMemMoveToHost = true; + MemMoveTargetQueue = HT.MQueue; + } + } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + NeedMemMoveToHost = true; + + if (NeedMemMoveToHost) insertMemoryMove(Record, Req, Scheduler::getInstance().getDefaultHostQueue()); - insertMemoryMove(Record, Req, Queue); + insertMemoryMove(Record, Req, MemMoveTargetQueue); } std::set Deps = findDepsForReq(Record, Req, Queue->getContextImplPtr()); @@ -927,10 +967,11 @@ void Scheduler::GraphBuilder::connectDepEvent(Command *const Cmd, { std::unique_ptr HT(new detail::HostTask); std::unique_ptr ConnectCG(new detail::CGHostTask( - std::move(HT), /* Args = */ {}, /* ArgsStorage = */ {}, - /* AccStorage = */ {}, /* SharedPtrStorage = */ {}, - /* Requirements = */ {}, /* DepEvents = */ {DepEvent}, - CG::CODEPLAY_HOST_TASK, /* Payload */ {})); + std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {}, + /* ArgsStorage = */ {}, /* AccStorage = */ {}, + /* SharedPtrStorage = */ {}, /* Requirements = */ {}, + /* DepEvents = */ {DepEvent}, CG::CODEPLAY_HOST_TASK, + /* Payload */ {})); ConnectCmd = new ExecCGCommand( std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue()); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 9939638cc851b..7ad8e2bf88b76 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -85,9 +85,10 @@ event handler::finalize() { break; case detail::CG::CODEPLAY_HOST_TASK: CommandGroup.reset(new detail::CGHostTask( - std::move(MHostTask), std::move(MArgs), std::move(MArgsStorage), - std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc)); + 
std::move(MHostTask), MQueue, MQueue->getContextImplPtr(), + std::move(MArgs), std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MEvents), MCGType, MCodeLoc)); break; case detail::CG::BARRIER: case detail::CG::BARRIER_WAITLIST: diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index e975f36913e2a..69136d1c1b774 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3320,6 +3320,10 @@ _ZNK2cl4sycl13host_selectorclERKNS0_6deviceE _ZNK2cl4sycl14exception_list3endEv _ZNK2cl4sycl14exception_list4sizeEv _ZNK2cl4sycl14exception_list5beginEv +_ZNK2cl4sycl14interop_handle12getNativeMemEPNS0_6detail16AccessorImplHostE +_ZNK2cl4sycl14interop_handle14getNativeQueueEv +_ZNK2cl4sycl14interop_handle15getNativeDeviceEv +_ZNK2cl4sycl14interop_handle16getNativeContextEv _ZNK2cl4sycl15device_selector13select_deviceEv _ZNK2cl4sycl15interop_handler12GetNativeMemEPNS0_6detail16AccessorImplHostE _ZNK2cl4sycl15interop_handler14GetNativeQueueEv diff --git a/sycl/test/host-interop-task/host-task.cpp b/sycl/test/host-interop-task/host-task.cpp new file mode 100644 index 0000000000000..3f981058c02bb --- /dev/null +++ b/sycl/test/host-interop-task/host-task.cpp @@ -0,0 +1,64 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib -lOpenCL +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +#include + +using namespace cl::sycl; +using namespace cl::sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +// Check that a single host-task with a buffer will work +void test1() { + buffer Buffer{BUFFER_SIZE}; + + queue Q; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.codeplay_host_task([=] { + // A no-op + }); + }); +} + +void test2() { + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + queue Q; + + 
Q.submit([&](handler &CGH) { + auto Acc = Buffer1.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] = 123; }; + CGH.parallel_for(Acc.get_count(), Kernel); + }); + + Q.submit([&](handler &CGH) { + auto AccSrc = Buffer1.template get_access(CGH); + auto AccDst = Buffer2.template get_access(CGH); + + CGH.codeplay_host_task([=] { + for (size_t Idx = 0; Idx < AccDst.get_count(); ++Idx) + AccDst[Idx] = AccSrc[Idx]; + }); + }); + + { + auto Acc = Buffer2.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "Second buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert(Acc[Idx] == 123); + } + } +} + +int main() { + test1(); + test2(); + return 0; +} diff --git a/sycl/test/host-interop-task/interop-task.cpp b/sycl/test/host-interop-task/interop-task.cpp new file mode 100644 index 0000000000000..6db65398e06bb --- /dev/null +++ b/sycl/test/host-interop-task/interop-task.cpp @@ -0,0 +1,229 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out %threads_lib -lOpenCL +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// UNSUPPORTED: level0, cuda +// REQUIRES: opencl + +#include +#include +#include + +using namespace cl::sycl; +using namespace cl::sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +template +class Modifier; + +template +class Init; + +template +void copy(buffer &Src, buffer &Dst, queue &Q) { + Q.submit([&](handler &CGH) { + auto SrcA = Src.template get_access(CGH); + auto DstA = Dst.template get_access(CGH); + + CGH.codeplay_host_task([=](interop_handle IH) { + auto NativeQ = IH.get_native_queue(); + auto SrcMem = IH.get_native_mem(SrcA); + auto DstMem = IH.get_native_mem(DstA); + cl_event Event; + + int RC = clEnqueueCopyBuffer( + NativeQ, SrcMem, DstMem, 0, 0, sizeof(DataT) * SrcA.get_count(), 0, + nullptr, &Event); + + if (RC != CL_SUCCESS) + throw runtime_error("Can't enqueue buffer copy", RC); + + RC = 
clWaitForEvents(1, &Event); + + if (RC != CL_SUCCESS) + throw runtime_error("Can't wait for event on buffer copy", RC); + }); + }); +} + +template +void modify(buffer &B, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc = B.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] += 1; }; + + CGH.parallel_for>(Acc.get_count(), Kernel); + }); +} + +template +void init(buffer &B1, buffer &B2, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc1 = B1.template get_access(CGH); + auto Acc2 = B2.template get_access(CGH); + + CGH.parallel_for>(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = -1; + Acc2[Id] = -2; + }); + }); +} + +// A test that uses OpenCL interop to copy data from buffer A to buffer B, by +// getting cl_mem objects and calling the clEnqueueCopyBuffer. Then run a SYCL +// kernel that modifies the data in place for B, e.g. increment one, then copy +// back to buffer A. Run it on a loop, to ensure the dependencies and the +// reference counting of the objects is not leaked. 
+void test1() { + static constexpr int COUNT = 4; + queue Q; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // init the buffer with a priori invalid data + init(Buffer1, Buffer2, Q); + + // Repeat a couple of times + for (size_t Idx = 0; Idx < COUNT; ++Idx) { + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + { + auto Acc = Buffer1.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "First buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert((Acc[Idx] == COUNT - 1) && "Invalid data in the first buffer"); + } + } + { + auto Acc = Buffer2.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "Second buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert((Acc[Idx] == COUNT - 1) && "Invalid data in the second buffer"); + } + } +} + +// Same as above, but performing each command group on a separate SYCL queue +// (on the same or different devices). This ensures the dependency tracking +// works well but also there are no accidental side effects on other queues. 
+void test2() { + static constexpr int COUNT = 4; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // init the buffer with a priori invalid data + { + queue Q; + init(Buffer1, Buffer2, Q); + } + + // Repeat a couple of times + for (size_t Idx = 0; Idx < COUNT; ++Idx) { + queue Q; + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + { + auto Acc = Buffer1.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "First buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert((Acc[Idx] == COUNT - 1) && "Invalid data in the first buffer"); + } + } + { + auto Acc = Buffer2.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "Second buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert((Acc[Idx] == COUNT - 1) && "Invalid data in the second buffer"); + } + } +} + +// A test that does a clEnqueueWait inside the interop scope, for an event +// captured outside the command group. The OpenCL event can be set after the +// command group finishes. 
Must not deadlock according to implementation and +// proposal +void test3() { + // Want some large buffer for operation to take long + buffer Buffer{BUFFER_SIZE * 128}; + + queue Q; + + event Event = Q.submit([&](handler &CGH) { + auto Acc1 = Buffer.get_access(CGH); + + CGH.parallel_for(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = 123; + }); + }); + + Q.submit([&](handler &CGH) { + CGH.codeplay_host_task([=](interop_handle IH) { + cl_event Ev = Event.get(); + + int RC = clWaitForEvents(1, &Ev); + + if (RC != CL_SUCCESS) + throw runtime_error("Can't wait for events", RC); + }); + }); +} + +// Check that a single host-interop-task with a buffer will work +void test4() { + buffer Buffer{BUFFER_SIZE}; + + queue Q; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + CGH.codeplay_host_task([=](interop_handle IH) { + // A no-op + }); + }); +} + +void test5() { + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + queue Q; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer1.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] = 123; }; + CGH.parallel_for(Acc.get_count(), Kernel); + }); + + copy(Buffer1, Buffer2, Q); + + { + auto Acc = Buffer2.get_access(); + + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + std::cout << "Second buffer [" << Idx << "] = " << Acc[Idx] << std::endl; + assert(Acc[Idx] == 123); + } + } +} + +int main() { + test1(); + test2(); + test3(); + test4(); + test5(); + return 0; +}