diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 941e878ebb591..49767539bc683 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -107,32 +107,6 @@ template <typename Name, typename Type> struct get_kernel_name_t {
 
 __SYCL_EXPORT device getDeviceFromHandler(handler &);
 
-/// These are the forward declaration for the classes that help to create
-/// names for additional kernels. It is used only when there are
-/// more then 1 kernels in one parallel_for() implementing SYCL reduction.
-template <typename Type> class __sycl_reduction_main_2nd_kernel;
-template <typename Type> class __sycl_reduction_aux_1st_kernel;
-template <typename Type> class __sycl_reduction_aux_2nd_kernel;
-
-/// Helper structs to get additional kernel name types based on given
-/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
-/// \c Type becomes the \c Name.
-template <typename Name, typename Type>
-struct get_reduction_main_2nd_kernel_name_t {
-  using name = __sycl_reduction_main_2nd_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-template <typename Name, typename Type>
-struct get_reduction_aux_1st_kernel_name_t {
-  using name = __sycl_reduction_aux_1st_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-template <typename Name, typename Type>
-struct get_reduction_aux_2nd_kernel_name_t {
-  using name = __sycl_reduction_aux_2nd_kernel<
-      typename get_kernel_name_t<Name, Type>::name>;
-};
-
 device getDeviceFromHandler(handler &);
 
 } // namespace detail
@@ -142,6 +116,14 @@ namespace detail {
 
 template class reduction_impl;
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu);
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+                   size_t KernelRun, Reduction &Redu);
 
 } // namespace detail
 } // namespace intel
@@ -231,13 +213,6 @@ class __SYCL_EXPORT handler {
   /// usage in finalize() method.
   void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
 
-  /// Stores the given \param Event to the \param Queue.
-  /// Even though MQueue is a field of handler, the method addEvent() of
-  /// queue_impl class cannot be called inside this handler.hpp file
-  /// as queue_impl is incomplete class for handler.
-  static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event);
-
   /// Constructs CG object of specific type, passes it to Scheduler and
   /// returns sycl::event object representing the command group.
   /// It's expected that the method is the latest method executed before
@@ -288,30 +263,6 @@ class __SYCL_EXPORT handler {
                  /*index*/ 0);
   }
 
-  template
-  void dissociateWithHandler(accessor
-                                 Acc) {
-    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
-    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
-    detail::Requirement *Req = AccImpl.get();
-
-    // Remove accessor from the list of requirements, accessors storage,
-    // and from the list of associated accessors.
-    auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
-    auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
-    auto It =
-        std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
-                     [Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
-    assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
-            It != MAssociatedAccesors.end()) &&
-           "Cannot dissociate accessor.");
-    MRequirements.erase(ReqIt);
-    MAccStorage.erase(AccIt);
-    MAssociatedAccesors.erase(It);
-  }
-
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
@@ -810,219 +761,6 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
-  /// Implements a command group function that enqueues a kernel that calls
-  /// user's lambda function \param KernelFunc and does one iteration of
-  /// reduction of elements in each of work-groups.
-  /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-group the partial sum is written
-  /// to a global buffer.
-  ///
-  /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
-  template <typename KernelName, typename KernelType, int Dims, class Reduction>
-  void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
-                  Reduction &Redu) {
-
-    size_t NWorkItems = Range.get_global_range().size();
-    size_t WGSize = Range.get_local_range().size();
-    size_t NWorkGroups = Range.get_group_range().size();
-
-    bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
-    bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
-
-    bool IsUpdateOfUserAcc =
-        Reduction::accessor_mode == access::mode::read_write &&
-        NWorkGroups == 1;
-
-    // Use local memory to reduce elements in work-groups into 0-th element.
-    // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch elements that could
-    // otherwise be lost in the tree-reduction algorithm.
-    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
-    auto ReduIdentity = Redu.getIdentity();
-    if (IsEfficientCase) {
-      // Efficient case: work-groups are fully loaded and work-group size
-      // is power of two.
-      parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
-        // Call user's functions. Reducer.MValue gets initialized there.
-        typename Reduction::reducer_type Reducer(ReduIdentity);
-        KernelFunc(NDIt, Reducer);
-
-        // Copy the element to local memory to prepare it for tree-reduction.
-        size_t LID = NDIt.get_local_linear_id();
-        LocalReds[LID] = Reducer.MValue;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
-        typename Reduction::binary_operation BOp;
-        size_t WGSize = NDIt.get_local_range().size();
-        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          NDIt.barrier();
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0)
-          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                                : LocalReds[0];
-      });
-    } else {
-      // Inefficient case: work-groups are not fully loaded
-      // or WGSize is not power of two.
-      // These two inefficient cases are handled by one kernel, which
-      // can be split later into two separate kernels, if there are users who
-      // really need more efficient code for them.
-      using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
-          KernelName, KernelType>::name;
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        // Call user's functions. Reducer.MValue gets initialized there.
-        typename Reduction::reducer_type Reducer(ReduIdentity);
-        KernelFunc(NDIt, Reducer);
-
-        size_t WGSize = NDIt.get_local_range().size();
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        // Copy the element to local memory to prepare it for tree-reduction.
-        LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
-        LocalReds[WGSize] = ReduIdentity;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        // LocalReds[WGSize] accumulates last/odd elements when the step
-        // of tree-reduction loop is not even.
-        typename Reduction::binary_operation BOp;
-        size_t PrevStep = WGSize;
-        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          else if (LID == CurStep && (PrevStep & 0x1))
-            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-          NDIt.barrier();
-          PrevStep = CurStep;
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0) {
-          auto GrID = NDIt.get_group_linear_id();
-          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-          Out.get_pointer().get()[GrID] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-        }
-      });
-    }
-  }
-
-  /// Implements a command group function that enqueues a kernel that does one
-  /// iteration of reduction of elements in each of work-groups.
-  /// This version uses tree-reduction algorithm to reduce elements in each
-  /// of work-groups. At the end of each work-group the partial sum is written
-  /// to a global buffer.
-  ///
-  /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
-  template <typename KernelName, typename KernelType, int Dims, class Reduction>
-  void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
-                     size_t KernelRun, Reduction &Redu) {
-    size_t WGSize = Range.get_local_range().size();
-    size_t NWorkGroups = Range.get_group_range().size();
-
-    // The last work-group may be not fully loaded with work, or the work group
-    // size may be not power of those. Those two cases considered inefficient
-    // as they require additional code and checks in the kernel.
-    bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
-    bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
-
-    bool IsUpdateOfUserAcc =
-        Reduction::accessor_mode == access::mode::read_write &&
-        NWorkGroups == 1;
-
-    // Use local memory to reduce elements in work-groups into 0-th element.
-    // If WGSize is not power of two, then WGSize+1 elements are allocated.
-    // The additional last element is used to catch elements that could
-    // otherwise be lost in the tree-reduction algorithm.
-    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
-    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
-
-    // Get read accessor to the buffer that was used as output
-    // in the previous kernel. After that create new output buffer if needed
-    // and get accessor to it (or use reduction's accessor if the kernel
-    // is the last one).
-    auto In = Redu.getReadAccToPreviousPartialReds(*this);
-    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
-
-    if (IsEfficientCase) {
-      // Efficient case: work-groups are fully loaded and work-group size
-      // is power of two.
-      using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
-          KernelName, KernelType>::name;
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        // Copy the element to local memory to prepare it for tree-reduction.
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        LocalReds[LID] = In[GID];
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        typename Reduction::binary_operation BOp;
-        size_t WGSize = NDIt.get_local_range().size();
-        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          NDIt.barrier();
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0)
-          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                                : LocalReds[0];
-      });
-    } else {
-      // Inefficient case: work-groups are not fully loaded
-      // or WGSize is not power of two.
-      // These two inefficient cases are handled by one kernel, which
-      // can be split later into two separate kernels, if there are users
-      // who really need more efficient code for them.
-      using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
-          KernelName, KernelType>::name;
-      auto ReduIdentity = Redu.getIdentity();
-      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
-        size_t WGSize = NDIt.get_local_range().size();
-        size_t LID = NDIt.get_local_linear_id();
-        size_t GID = NDIt.get_global_linear_id();
-        // Copy the element to local memory to prepare it for tree-reduction
-        LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
-        LocalReds[WGSize] = ReduIdentity;
-        NDIt.barrier();
-
-        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
-        // LocalReds[WGSize] accumulates last/odd elements when the step
-        // of tree-reduction loop is not even.
-        typename Reduction::binary_operation BOp;
-        size_t PrevStep = WGSize;
-        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
-          if (LID < CurStep)
-            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
-          else if (LID == CurStep && (PrevStep & 0x1))
-            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
-          NDIt.barrier();
-          PrevStep = CurStep;
-        }
-
-        // Compute the partial sum/reduction for the work-group.
-        if (LID == 0) {
-          auto GrID = NDIt.get_group_linear_id();
-          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-          Out.get_pointer().get()[GrID] =
-              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
-        }
-      });
-    }
-  }
-
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   /// Performs reduction operation specified in \param Redu.
   ///
@@ -1063,30 +801,23 @@ class __SYCL_EXPORT handler {
     // necessary to reduce all partial sums into one final sum.
 
     // 1. Call the kernel that includes user's lambda function.
-    // If this kernel is going to be now last one, i.e. it does not write
-    // to user's accessor, then detach user's accessor from this kernel
-    // to make the dependencies between accessors and kernels more clean and
-    // correct.
-    if (NWorkGroups > 1)
-      dissociateWithHandler(Redu.MAcc);
-
-    reduCGFunc<KernelName>(KernelFunc, Range, Redu);
+    intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     auto QueueCopy = MQueue;
     MLastEvent = this->finalize();
 
    // 2. Run the additional aux kernel as many times as needed to reduce
    // all partial sums into one scalar.
+
+    // TODO: user's nd_range and the work-group size specified there must
+    // be honored only for the main kernel that calls user's lambda functions.
+    // There is no need to use the same work-group size in these additional
+    // kernels. Thus, the better strategy here is to make the work-group size
+    // as big as possible to converge/reduce the partial sums into the last
+    // sum faster.
     size_t WGSize = Range.get_local_range().size();
     size_t NWorkItems = NWorkGroups;
     size_t KernelRun = 1;
     while (NWorkItems > 1) {
-      // Before creating another kernel, add the event from the previous kernel
-      // to queue.
-      addEventToQueue(QueueCopy, MLastEvent);
-
-      // TODO: here the work-group size is not limited by user's needs,
-      // the better strategy here is to make the work-group-size as big
-      // as possible.
      WGSize = std::min(WGSize, NWorkItems);
      NWorkGroups = NWorkItems / WGSize;
      // The last group may be not fully loaded. Still register it as a group.
@@ -1102,8 +833,8 @@ class __SYCL_EXPORT handler {
      // Associate it with handler manually.
      if (NWorkGroups == 1)
        AuxHandler.associateWithHandler(Redu.MAcc);
-      AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
-                                                       KernelRun, Redu);
+      intel::detail::reduAuxCGFunc<KernelName, KernelType>(
+          AuxHandler, Range, NWorkItems, KernelRun, Redu);
      MLastEvent = AuxHandler.finalize();
 
      NWorkItems = NWorkGroups;
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index 3b6289cb03ecc..a22f7f38249d7 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -377,6 +377,248 @@ class reduction_impl {
   shared_ptr_class> MOutBufPtr;
 };
 
+/// These are the forward declarations for the classes that help to create
+/// names for additional kernels. It is used only when there is
+/// more than one kernel in one parallel_for() implementing SYCL reduction.
+template <typename Type> class __sycl_reduction_main_2nd_kernel;
+template <typename Type> class __sycl_reduction_aux_1st_kernel;
+template <typename Type> class __sycl_reduction_aux_2nd_kernel;
+
+/// Helper structs to get additional kernel name types based on given
+/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
+/// \c Type becomes the \c Name.
+template <typename Name, typename Type>
+struct get_reduction_main_2nd_kernel_name_t {
+  using name = __sycl_reduction_main_2nd_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_1st_kernel_name_t {
+  using name = __sycl_reduction_aux_1st_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_2nd_kernel_name_t {
+  using name = __sycl_reduction_aux_2nd_kernel<
+      typename sycl::detail::get_kernel_name_t<Name, Type>::name>;
+};
+
+/// Implements a command group function that enqueues a kernel that calls
+/// user's lambda function \param KernelFunc and does one iteration of
+/// reduction of elements in each of work-groups.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu) {
+
+  size_t NWorkItems = Range.get_global_range().size();
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // The last work-group may be not fully loaded with work, or the work group
+  // size may be not power of two. Those two cases are considered inefficient
+  // as they require additional code and checks in the kernel.
+  bool HasNonUniformWG = (NWorkGroups * WGSize - NWorkItems) != 0;
+  bool IsEfficientCase = !HasNonUniformWG && ((WGSize & (WGSize - 1)) == 0);
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Use local memory to reduce elements in work-groups into 0-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm.
+  size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, CGH);
+  auto ReduIdentity = Redu.getIdentity();
+  if (IsEfficientCase) {
+    // Efficient case: work-groups are uniform and WGSize is a power of two.
+    CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer(ReduIdentity);
+      KernelFunc(NDIt, Reducer);
+
+      // Copy the element to local memory to prepare it for tree-reduction.
+      size_t LID = NDIt.get_local_linear_id();
+      LocalReds[LID] = Reducer.MValue;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0)
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                              : LocalReds[0];
+    });
+  } else {
+    // Inefficient case: work-groups are non-uniform or WGSize is not power
+    // of two, which requires more conditional, read and write operations.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users who
+    // really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_main_2nd_kernel_name_t<KernelName, KernelType>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer(ReduIdentity);
+      KernelFunc(NDIt, Reducer);
+
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction.
+      LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0) {
+        size_t GrID = NDIt.get_group_linear_id();
+        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+        Out.get_pointer().get()[GrID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      }
+    });
+  }
+}
+
+/// Implements a command group function that enqueues a kernel that does one
+/// iteration of reduction of elements in each of work-groups.
+/// This version uses tree-reduction algorithm to reduce elements in each
+/// of work-groups. At the end of each work-group the partial sum is written
+/// to a global buffer.
+///
+/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
+                   size_t KernelRun, Reduction &Redu) {
+  size_t WGSize = Range.get_local_range().size();
+  size_t NWorkGroups = Range.get_group_range().size();
+
+  // The last work-group may be not fully loaded with work, or the work group
+  // size may be not power of two. Those two cases are considered inefficient
+  // as they require additional code and checks in the kernel.
+  bool HasNonUniformWG = NWorkGroups * WGSize != NWorkItems;
+  bool IsEfficientCase = !HasNonUniformWG && (WGSize & (WGSize - 1)) == 0;
+
+  bool IsUpdateOfUserAcc =
+      Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
+
+  // Use local memory to reduce elements in work-groups into 0-th element.
+  // If WGSize is not power of two, then WGSize+1 elements are allocated.
+  // The additional last element is used to catch elements that could
+  // otherwise be lost in the tree-reduction algorithm.
+  size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+  auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
+
+  // Get read accessor to the buffer that was used as output
+  // in the previous kernel. After that create new output buffer if needed
+  // and get accessor to it (or use reduction's accessor if the kernel
+  // is the last one).
+  auto In = Redu.getReadAccToPreviousPartialReds(CGH);
+  auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, CGH);
+
+  if (IsEfficientCase) {
+    // Efficient case: work-groups are fully loaded and work-group size
+    // is power of two.
+    using AuxName =
+        typename get_reduction_aux_1st_kernel_name_t<KernelName, KernelType>::name;
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      // Copy the element to local memory to prepare it for tree-reduction.
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      LocalReds[LID] = In[GID];
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      typename Reduction::binary_operation BOp;
+      size_t WGSize = NDIt.get_local_range().size();
+      for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        NDIt.barrier();
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0)
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                              : LocalReds[0];
+    });
+  } else {
+    // Inefficient case: work-groups are not fully loaded
+    // or WGSize is not power of two.
+    // These two inefficient cases are handled by one kernel, which
+    // can be split later into two separate kernels, if there are users
+    // who really need more efficient code for them.
+    using AuxName =
+        typename get_reduction_aux_2nd_kernel_name_t<KernelName, KernelType>::name;
+    auto ReduIdentity = Redu.getIdentity();
+    CGH.parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+      size_t WGSize = NDIt.get_local_range().size();
+      size_t LID = NDIt.get_local_linear_id();
+      size_t GID = NDIt.get_global_linear_id();
+      // Copy the element to local memory to prepare it for tree-reduction
+      LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
+      LocalReds[WGSize] = ReduIdentity;
+      NDIt.barrier();
+
+      // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+      // LocalReds[WGSize] accumulates last/odd elements when the step
+      // of tree-reduction loop is not even.
+      typename Reduction::binary_operation BOp;
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        NDIt.barrier();
+        PrevStep = CurStep;
+      }
+
+      // Compute the partial sum/reduction for the work-group.
+      if (LID == 0) {
+        size_t GrID = NDIt.get_group_linear_id();
+        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+        Out.get_pointer().get()[GrID] =
+            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      }
+    });
+  }
+}
+
 } // namespace detail
 
 /// Creates and returns an object implementing the reduction functionality.
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 9b65ead92fcde..a2dba553923a7 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -351,11 +351,6 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative() const;
 
-  /// Stores an event that should be associated with the queue
-  ///
-  /// \param Event is the event to be stored
-  void addEvent(event Event);
-
 private:
   /// Performs command group submission to the queue.
   ///
@@ -388,6 +383,11 @@ class queue_impl {
   /// \param Event is the event to be stored
   void addUSMEvent(event Event);
 
+  /// Stores an event that should be associated with the queue
+  ///
+  /// \param Event is the event to be stored
+  void addEvent(event Event);
+
   /// Protects all the fields that can be changed by class' methods.
   mutex_class MMutex;
 
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 8685b1060dcdc..6d018f6d3b122 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -19,11 +19,6 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 
-void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
-                              cl::sycl::event Event) {
-  Queue->addEvent(std::move(Event));
-}
-
 event handler::finalize() {
   // This block of code is needed only for reduction implementation.
   // It is harmless (does nothing) for everything else.
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 73e9c1ebb4dec..cac45ced92b59 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3231,7 +3231,6 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
 _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
 _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
-_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
 _ZN2cl4sycl7handler8finalizeEv
 _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
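Background note on the moved kernels: both reduCGFunc and reduAuxCGFunc reduce each work-group with the same tree algorithm, and when WGSize is not a power of two they use one extra LocalReds[WGSize] slot that catches the odd element each time a halving step has an odd size. Below is a minimal host-side sketch of that per-work-group reduction in plain C++, with the device lock-step emulated by a sequential loop; the function and variable names here (work_group_tree_reduce and friends) are illustrative only and are not part of the patch.

  #include <cstddef>
  #include <functional>
  #include <iostream>
  #include <vector>

  // Reduce WGSize values the way one work-group does on device.
  // LocalReds has WGSize + 1 slots; the last slot collects the odd element
  // that a halving step would otherwise drop when the step size is odd.
  template <typename T, typename BinOp>
  T work_group_tree_reduce(std::vector<T> LocalReds, size_t WGSize, BinOp BOp) {
    size_t PrevStep = WGSize;
    for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
      // Emulate the work-items of one step sequentially; on device a barrier
      // separates the steps, so the order within a step does not matter.
      for (size_t LID = 0; LID < WGSize; ++LID) {
        if (LID < CurStep)
          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
        else if (LID == CurStep && (PrevStep & 0x1))
          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
      }
      PrevStep = CurStep;
    }
    // The extra slot holds the leftovers; fold it into the final result.
    return BOp(LocalReds[0], LocalReds[WGSize]);
  }

  int main() {
    const size_t WGSize = 7;                   // deliberately not a power of two
    std::vector<int> LocalReds(WGSize + 1, 0); // extra slot starts at the identity
    for (size_t I = 0; I < WGSize; ++I)
      LocalReds[I] = static_cast<int>(I + 1);  // 1 + 2 + ... + 7 = 28
    std::cout << work_group_tree_reduce(LocalReds, WGSize, std::plus<int>())
              << "\n"; // prints 28
  }

In the patched kernels, work-items whose global id falls past NWorkItems contribute the reduction identity instead of a loaded value, which is why the non-uniform case initializes LocalReds entries to ReduIdentity.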
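The TODO added in handler.hpp concerns the driver loop that re-launches the aux kernel until a single value remains: after the main kernel leaves one partial sum per work-group, every aux run shrinks NWorkItems down to the new NWorkGroups. A self-contained sketch of that loop's arithmetic follows; the concrete numbers (16, 1000) and the NWorkGroups rounding expression are illustrative stand-ins, not code from the patch.

  #include <algorithm>
  #include <cstddef>
  #include <iostream>

  int main() {
    size_t MaxWGSize = 16;    // stands in for the user's nd_range local size
    size_t NWorkItems = 1000; // partial sums produced by the main kernel

    size_t KernelRun = 1;
    while (NWorkItems > 1) {
      size_t WGSize = std::min(MaxWGSize, NWorkItems);
      size_t NWorkGroups = NWorkItems / WGSize;
      // The last group may be under-loaded; it still counts as a group.
      if (NWorkGroups * WGSize < NWorkItems)
        ++NWorkGroups;
      std::cout << "aux run " << KernelRun << ": reduce " << NWorkItems
                << " values with " << NWorkGroups << " group(s) of " << WGSize
                << "\n";
      NWorkItems = NWorkGroups; // each group leaves exactly one partial sum
      ++KernelRun;
    }
    // With 1000 partial sums and a work-group size of 16, the remaining
    // counts go 1000 -> 63 -> 4 -> 1, i.e. three aux runs.
  }

This also shows why the TODO matters: the number of aux runs depends only on the work-group size used for them, so a larger size than the user's original local range would converge in fewer passes.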
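The get_reduction_*_kernel_name_t helpers moved into reduction.hpp follow the usual "fall back to the functor type when no kernel name was given" naming pattern. A compile-time sketch of that pattern is below; auto_name_sketch, kernel_name_or_functor, and main_2nd_kernel_name are hypothetical stand-ins written for illustration, not the real SYCL types.

  #include <type_traits>

  struct auto_name_sketch {};                  // plays the role of detail::auto_name
  template <typename T> class main_2nd_kernel; // the extra kernel needs its own name

  // If no kernel name was supplied (Name == auto_name_sketch), use the functor
  // type itself, mirroring what detail::get_kernel_name_t does.
  template <typename Name, typename Type> struct kernel_name_or_functor {
    using name = Name;
  };
  template <typename Type> struct kernel_name_or_functor<auto_name_sketch, Type> {
    using name = Type;
  };

  // The additional kernel gets a distinct name derived from the resolved one,
  // mirroring get_reduction_main_2nd_kernel_name_t.
  template <typename Name, typename Type> struct main_2nd_kernel_name {
    using name =
        main_2nd_kernel<typename kernel_name_or_functor<Name, Type>::name>;
  };

  struct MyKernel;  // user-supplied kernel name
  struct MyFunctor; // type of the user's lambda/functor

  static_assert(std::is_same<main_2nd_kernel_name<MyKernel, MyFunctor>::name,
                             main_2nd_kernel<MyKernel>>::value,
                "explicit kernel name is kept");
  static_assert(
      std::is_same<main_2nd_kernel_name<auto_name_sketch, MyFunctor>::name,
                   main_2nd_kernel<MyFunctor>>::value,
      "unnamed kernels fall back to the functor type");

  int main() {}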