diff --git a/sycl/include/CL/sycl/ONEAPI/reduction.hpp b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
index 26e05b56f4263..0153d8086a20c 100644
--- a/sycl/include/CL/sycl/ONEAPI/reduction.hpp
+++ b/sycl/include/CL/sycl/ONEAPI/reduction.hpp
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include "CL/sycl/ONEAPI/accessor_property_list.hpp"
+#include <CL/sycl/ONEAPI/accessor_property_list.hpp>
 #include <CL/sycl/accessor.hpp>
 #include <CL/sycl/atomic.hpp>
 #include <CL/sycl/handler.hpp>
@@ -250,78 +250,100 @@ class reducer
+  template <access::address_space Space>
+  static constexpr memory_scope getMemoryScope() {
+    return Space == access::address_space::local_space
+               ? memory_scope::work_group
+               : memory_scope::device;
+  }
+
+public:
   /// Atomic ADD operation: *ReduVarPtr += MValue;
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
-              IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsPlus<T, _BinaryOperation>::value>
+              (IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value ||
+               IsReduOptForAtomic64Add<T, _BinaryOperation>::value) &&
+              sycl::detail::IsPlus<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
         .fetch_add(MValue);
   }
 
   /// Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsBitOR<T, _BinaryOperation>::value>
+              sycl::detail::IsBitOR<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
         .fetch_or(MValue);
   }
 
   /// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsBitXOR<T, _BinaryOperation>::value>
+              sycl::detail::IsBitXOR<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
        .fetch_xor(MValue);
   }
 
   /// Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsBitAND<T, _BinaryOperation>::value>
+              sycl::detail::IsBitAND<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
         .fetch_and(MValue);
   }
 
   /// Atomic MIN operation: *ReduVarPtr = ONEAPI::minimum(*ReduVarPtr, MValue);
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsMinimum<T, _BinaryOperation>::value>
+              sycl::detail::IsMinimum<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
         .fetch_min(MValue);
   }
 
   /// Atomic MAX operation: *ReduVarPtr = ONEAPI::maximum(*ReduVarPtr, MValue);
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  template <access::address_space Space = access::address_space::global_space,
+            typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               IsReduOptForFastAtomicFetch<T, _BinaryOperation>::value &&
-              sycl::detail::IsMaximum<T, _BinaryOperation>::value>
+              sycl::detail::IsMaximum<T, _BinaryOperation>::value &&
+              (Space == access::address_space::global_space ||
+               Space == access::address_space::local_space)>
   atomic_combine(_T *ReduVarPtr) const {
-    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+    atomic_ref<T, memory_order::relaxed, getMemoryScope<Space>(), Space>(
+        *multi_ptr<T, Space>(ReduVarPtr))
         .fetch_max(MValue);
   }
 
-  /// Atomic ADD operation: for floating point using atomic_ref
-  template <typename _T = T, class _BinaryOperation = BinaryOperation>
-  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
-              IsReduOptForAtomic64Add<T, _BinaryOperation>::value>
-  atomic_combine(_T *ReduVarPtr) const {
-
-    atomic_ref<T, sycl::ONEAPI::memory_order::relaxed,
-               sycl::ONEAPI::memory_scope::device,
-               access::address_space::global_space>(
-        *global_ptr<T>(ReduVarPtr)) += MValue;
-  }
-
   T MValue;
 };
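The hunk above folds the old floating-point `atomic_ref` overload into the generic `atomic_combine` and parameterizes it by destination address space: a combine into work-group local memory only needs `memory_scope::work_group`, while a combine into global memory needs `memory_scope::device`. A minimal standalone sketch of that mapping, assuming the `ONEAPI::atomic_ref` interface used in the diff (the helper name `relaxed_add` is hypothetical and not part of the patch):

```cpp
#include <CL/sycl.hpp>
using namespace cl::sycl;

// Device-side helper: relaxed atomic += into local or global memory,
// picking the narrowest memory scope that still covers the destination.
template <access::address_space Space, typename T>
void relaxed_add(T *Slot, T Val) {
  constexpr auto Scope = (Space == access::address_space::local_space)
                             ? ONEAPI::memory_scope::work_group
                             : ONEAPI::memory_scope::device;
  ONEAPI::atomic_ref<T, ONEAPI::memory_order::relaxed, Scope, Space>(*Slot)
      .fetch_add(Val);
}
```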
@@ -361,8 +383,6 @@ class reduction_impl : private reduction_impl_base {
                ONEAPI::accessor_property_list<>>;
   static constexpr int accessor_dim = Dims;
   static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
-  using local_accessor_type =
-      accessor<T, buffer_dim, access::mode::read_write, access::target::local>;
 
   static constexpr bool has_atomic_add_float64 =
       IsReduOptForAtomic64Add<T, BinaryOperation>::value;
@@ -612,8 +632,15 @@ class reduction_impl : private reduction_impl_base {
 #endif
   }
 
-  static local_accessor_type getReadWriteLocalAcc(size_t Size, handler &CGH) {
-    return local_accessor_type(Size, CGH);
+  /// Creates and returns a local accessor with the \p Size elements.
+  /// By default the local accessor elements have the same type as the
+  /// elements processed by the reduction, but that may be altered by
+  /// specifying \p _T explicitly if a different element type is needed.
+  template <typename _T = result_type>
+  static accessor<_T, buffer_dim, access::mode::read_write,
+                  access::target::local>
+  getReadWriteLocalAcc(size_t Size, handler &CGH) {
+    return {Size, CGH};
   }
 
   accessor<T, buffer_dim, access::mode::read_write,
@@ -686,6 +713,16 @@ class reduction_impl : private reduction_impl_base {
     return createHandlerWiredReadWriteAccessor(CGH, *MOutBufPtr);
   }
 
+  accessor<int, 1, access::mode::read_write, access::target::global_buffer,
+           access::placeholder::false_t>
+  getReadWriteAccessorToInitializedGroupsCounter(handler &CGH) {
+    auto CounterMem = std::make_shared<int>(0);
+    CGH.addReduction(CounterMem);
+    auto CounterBuf = std::make_shared<buffer<int, 1>>(CounterMem.get(), 1);
+    CGH.addReduction(CounterBuf);
+    return {*CounterBuf, CGH};
+  }
+
   bool hasUserDiscardWriteAccessor() { return MDWAcc != nullptr; }
 
   template <typename BufferT>
@@ -780,6 +817,219 @@ struct get_reduction_aux_kernel_name_t
 };
 
+template <int Dims, typename ReducerT, typename KernelFunc>
+void reductionLoop(const range<Dims> &Range, ReducerT &Reducer,
+                   const nd_item<1> &NdId, KernelFunc &F) {
+  size_t Start = NdId.get_global_id(0);
+  size_t End = Range.size();
+  size_t Stride = NdId.get_global_range(0);
+  for (size_t I = Start; I < End; I += Stride)
+    F(sycl::detail::getDelinearizedId(Range, I), Reducer);
+}
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<Reduction::has_fast_atomics>
+reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
+               const nd_range<1> &NDRange, Reduction &Redu) {
+  auto Out = Redu.getReadWriteAccessorToInitializedMem(CGH);
+  auto GroupSum = Reduction::getReadWriteLocalAcc(1, CGH);
+  using Name =
+      typename get_reduction_main_kernel_name_t<
+          KernelName, KernelType, Reduction::is_usm,
+          Reduction::has_atomic_add_float64, decltype(Out)>::name;
+  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
+    // Call user's functions. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer;
+    reductionLoop(Range, Reducer, NDId, KernelFunc);
+
+    auto LID = NDId.get_local_id(0);
+    if (LID == 0)
+      GroupSum[0] = Reducer.getIdentity();
+    sycl::detail::workGroupBarrier();
+    Reducer.template atomic_combine<access::address_space::local_space>(
+        &GroupSum[0]);
+
+    sycl::detail::workGroupBarrier();
+    if (LID == 0) {
+      Reducer.MValue = GroupSum[0];
+      Reducer.template atomic_combine(Reduction::getOutPointer(Out));
+    }
+  });
+}
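`reductionLoop` decouples the ND-range size from the number of elements: it walks the linearized index space with a grid-stride loop and delinearizes each index before invoking the user's lambda. The fast-atomics kernel above then funnels the whole work-group through a single local-memory cell so that only one global atomic is issued per group. A host-side sketch of the traversal (not part of the patch; sizes are hypothetical):

```cpp
#include <cstdio>

// Four work-items (global range 4) covering ten elements: work-item W
// processes linear indices W, W+4, W+8, ... exactly as reductionLoop does.
int main() {
  const unsigned End = 10, Stride = 4; // Range.size(), get_global_range(0)
  for (unsigned WI = 0; WI < Stride; ++WI) {
    std::printf("work-item %u:", WI);
    for (unsigned I = WI; I < End; I += Stride)
      std::printf(" %u", I);
    std::printf("\n");
  }
}
```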
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
+               const nd_range<1> &NDRange, Reduction &Redu) {
+  size_t WGSize = NDRange.get_local_range().size();
+  size_t NWorkGroups = NDRange.get_group_range().size();
+
+  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
+  auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
+  auto Out =
+      (NWorkGroups == 1) ? PartialSums : Redu.getWriteAccForPartialReds(1, CGH);
+  auto NWorkGroupsFinished =
+      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+  auto DoReducePartialSumsInLastWG =
+      Reduction::template getReadWriteLocalAcc<int>(1, CGH);
+
+  using Name =
+      typename get_reduction_main_kernel_name_t<
+          KernelName, KernelType, Reduction::is_usm, false,
+          decltype(Out)>::name;
+  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
+    // Call user's functions. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer;
+    reductionLoop(Range, Reducer, NDId, KernelFunc);
+
+    typename Reduction::binary_operation BOp;
+    auto Group = NDId.get_group();
+    Reducer.MValue = reduce_over_group(Group, Reducer.MValue, BOp);
+
+    size_t LID = NDId.get_local_id(0);
+    if (LID == 0) {
+      if (NWorkGroups == 1 && IsUpdateOfUserVar)
+        Reducer.MValue = BOp(Reducer.MValue, *Reduction::getOutPointer(Out));
+      // If NWorkGroups == 1, then PartialSums and Out point to the same memory.
+      Reduction::getOutPointer(PartialSums)[NDId.get_group_linear_id()] =
+          Reducer.MValue;
+
+      auto NFinished =
+          atomic_ref<int, memory_order::relaxed, memory_scope::device,
+                     access::address_space::global_space>(
+              NWorkGroupsFinished[0]);
+      DoReducePartialSumsInLastWG[0] =
+          ++NFinished == NWorkGroups && NWorkGroups > 1;
+    }
+
+    sycl::detail::workGroupBarrier();
+    if (DoReducePartialSumsInLastWG[0]) {
+      auto LocalSum = Reducer.getIdentity();
+      for (size_t I = LID; I < NWorkGroups; I += WGSize)
+        LocalSum = BOp(LocalSum, PartialSums[I]);
+      Reducer.MValue = reduce_over_group(Group, LocalSum, BOp);
+
+      if (LID == 0) {
+        if (IsUpdateOfUserVar)
+          Reducer.MValue = BOp(Reducer.MValue, *Reduction::getOutPointer(Out));
+        Reduction::getOutPointer(Out)[0] = Reducer.MValue;
+      }
+    }
+  });
+}
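This kernel makes the reduction single-pass via a cross-work-group completion counter: each group publishes its partial sum, atomically bumps `NWorkGroupsFinished`, and the group whose increment reaches `NWorkGroups` reduces all partial sums in place. The same idiom, modeled with `std::thread` for illustration only (names like `NWorkers` are hypothetical; the default sequentially consistent atomics make the partials visible to the last finisher):

```cpp
#include <atomic>
#include <cstdio>
#include <numeric>
#include <thread>
#include <vector>

int main() {
  constexpr int NWorkers = 4;
  std::vector<int> Partials(NWorkers);
  std::atomic<int> NFinished{0};
  int Result = 0;

  std::vector<std::thread> Pool;
  for (int W = 0; W < NWorkers; ++W)
    Pool.emplace_back([&, W] {
      Partials[W] = (W + 1) * 10;  // stand-in for this group's partial sum
      if (++NFinished == NWorkers) // only the last finisher runs the tail
        Result = std::accumulate(Partials.begin(), Partials.end(), 0);
    });
  for (auto &T : Pool)
    T.join();
  std::printf("%d\n", Result); // prints 100
}
```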
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+std::enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
+reduCGFuncImpl(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
+               const nd_range<1> &NDRange, Reduction &Redu) {
+  size_t WGSize = NDRange.get_local_range().size();
+  size_t NWorkGroups = NDRange.get_group_range().size();
+
+  bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
+  auto PartialSums = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
+  auto Out =
+      (NWorkGroups == 1) ? PartialSums : Redu.getWriteAccForPartialReds(1, CGH);
+  auto LocalReds = Reduction::getReadWriteLocalAcc(WGSize + 1, CGH);
+  auto NWorkGroupsFinished =
+      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+  auto DoReducePartialSumsInLastWG =
+      Reduction::template getReadWriteLocalAcc<int>(1, CGH);
+
+  auto Identity = Redu.getIdentity();
+  auto BOp = Redu.getBinaryOperation();
+  using Name =
+      typename get_reduction_main_kernel_name_t<
+          KernelName, KernelType, Reduction::is_usm, false,
+          decltype(Out)>::name;
+  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
+    // Call user's functions. Reducer.MValue gets initialized there.
+    typename Reduction::reducer_type Reducer(Identity, BOp);
+    reductionLoop(Range, Reducer, NDId, KernelFunc);
+
+    // Copy the element to local memory to prepare it for tree-reduction.
+    size_t LID = NDId.get_local_linear_id();
+    LocalReds[LID] = Reducer.MValue;
+    if (LID == 0)
+      LocalReds[WGSize] = Identity;
+    sycl::detail::workGroupBarrier();
+
+    // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+    // LocalReds[WGSize] accumulates the last/odd elements when the step
+    // of the tree-reduction loop is not even.
+    size_t PrevStep = WGSize;
+    for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+      if (LID < CurStep)
+        LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+      else if (LID == CurStep && (PrevStep & 0x1))
+        LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+      sycl::detail::workGroupBarrier();
+      PrevStep = CurStep;
+    }
+
+    if (LID == 0) {
+      auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+      if (NWorkGroups == 1 && IsUpdateOfUserVar)
+        V = BOp(V, *Reduction::getOutPointer(Out));
+      // If NWorkGroups == 1, then PartialSums and Out point to the same memory.
+      Reduction::getOutPointer(PartialSums)[NDId.get_group_linear_id()] = V;
+
+      auto NFinished =
+          atomic_ref<int, memory_order::relaxed, memory_scope::device,
+                     access::address_space::global_space>(
+              NWorkGroupsFinished[0]);
+      DoReducePartialSumsInLastWG[0] =
+          ++NFinished == NWorkGroups && NWorkGroups > 1;
+    }
+
+    sycl::detail::workGroupBarrier();
+    if (DoReducePartialSumsInLastWG[0]) {
+      auto LocalSum = Identity;
+      for (size_t I = LID; I < NWorkGroups; I += WGSize)
+        LocalSum = BOp(LocalSum, Reduction::getOutPointer(PartialSums)[I]);
+
+      LocalReds[LID] = LocalSum;
+      if (LID == 0)
+        LocalReds[WGSize] = Identity;
+      sycl::detail::workGroupBarrier();
+
+      size_t PrevStep = WGSize;
+      for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+        if (LID < CurStep)
+          LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+        else if (LID == CurStep && (PrevStep & 0x1))
+          LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+        sycl::detail::workGroupBarrier();
+        PrevStep = CurStep;
+      }
+      if (LID == 0) {
+        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+        if (IsUpdateOfUserVar)
+          V = BOp(V, *Reduction::getOutPointer(Out));
+        Reduction::getOutPointer(Out)[0] = V;
+      }
+    }
+  });
+}
+
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
+                size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
+                Reduction &Redu) {
+  size_t NWorkItems = Range.size();
+  size_t WGSize = std::min(NWorkItems, MaxWGSize);
+  size_t NWorkGroups = NWorkItems / WGSize;
+  if (NWorkItems % WGSize)
+    NWorkGroups++;
+  size_t MaxNWorkGroups = NumConcurrentWorkGroups;
+  NWorkGroups = std::min(NWorkGroups, MaxNWorkGroups);
+  size_t NDRItems = NWorkGroups * WGSize;
+  nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
+
+  reduCGFuncImpl<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
+}
+
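`reduCGFunc` rounds the work-group count up (ceiling division) and then clamps it to the concurrency estimate, so the launch never oversubscribes the device and the per-item grid-stride loop absorbs the remaining elements. Worked through with hypothetical inputs (not taken from the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  size_t NWorkItems = 1000000, MaxWGSize = 256; // hypothetical values
  uint32_t NumConcurrentWorkGroups = 512;       // e.g. 64 EUs * 8 threads
  size_t WGSize = std::min(NWorkItems, MaxWGSize);         // 256
  size_t NWorkGroups = (NWorkItems + WGSize - 1) / WGSize; // 3907
  NWorkGroups = std::min<size_t>(NWorkGroups, NumConcurrentWorkGroups); // 512
  // nd_range<1>{512 * 256, 256}: each work-item covers ~8 elements.
  std::printf("nd_range{%zu, %zu}\n", NWorkGroups * WGSize, WGSize);
}
```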
 /// Implements a command group function that enqueues a kernel that calls
 /// user's lambda function KernelFunc and also does one iteration of reduction
 /// of elements computed in user's lambda function.
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 1fd0bfe122453..001cdeec3bdfd 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -209,6 +209,11 @@ class reduction_impl;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;
 
+template <typename KernelName, typename KernelType, int Dims, class Reduction>
+void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
+                size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
+                Reduction &Redu);
+
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 enable_if_t<Reduction::has_atomic_add_float64>
 reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
@@ -262,6 +267,9 @@
 reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
                                    std::shared_ptr<queue_impl> Queue,
                                    bool IsHost, Reduction &Redu, RestT... Rest);
 
+__SYCL_EXPORT uint32_t
+reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
+
 __SYCL_EXPORT size_t reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                                       size_t LocalMemBytesPerWorkItem);
@@ -501,23 +509,6 @@ class __SYCL_EXPORT handler {
     }
   }
 
-  static id<1> getDelinearizedIndex(const range<1>, const size_t Index) {
-    return {Index};
-  }
-
-  static id<2> getDelinearizedIndex(const range<2> Range, const size_t Index) {
-    size_t x = Index % Range[1];
-    size_t y = Index / Range[1];
-    return {y, x};
-  }
-
-  static id<3> getDelinearizedIndex(const range<3> Range, const size_t Index) {
-    size_t z = Index / (Range[1] * Range[2]);
-    size_t y = (Index / Range[2]) % Range[1];
-    size_t x = Index % Range[2];
-    return {z, y, x};
-  }
-
   /// Stores lambda to the template-free object
   ///
   /// Also initializes kernel name, list of arguments and requirements using
@@ -593,9 +584,9 @@ class __SYCL_EXPORT handler {
                                      IsPHSrc, IsPHDst>>
         (LinearizedRange, [=](id<1> Id) {
       size_t Index = Id[0];
-      id<DimSrc> SrcIndex = getDelinearizedIndex(Src.get_range(), Index);
-      id<DimDst> DstIndex = getDelinearizedIndex(Dst.get_range(), Index);
-      Dst[DstIndex] = Src[SrcIndex];
+      id<DimSrc> SrcId = detail::getDelinearizedId(Src.get_range(), Index);
+      id<DimDst> DstId = detail::getDelinearizedId(Dst.get_range(), Index);
+      Dst[DstId] = Src[SrcId];
     });
     return true;
   }
@@ -1360,6 +1351,50 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
+  /// Defines and invokes a SYCL kernel function for the specified range.
+  ///
+  /// The SYCL kernel function is defined as a lambda function or a named
+  /// function object type and is given an id for indexing in the index
+  /// space defined by the range \p Range.
+  /// The parameter \p Redu contains the object created by the reduction()
+  /// function and defines the type and operation used in the corresponding
+  /// argument of 'reducer' type passed to the lambda/functor function.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  void parallel_for(range<Dims> Range, Reduction Redu,
+                    _KERNELFUNCPARAM(KernelFunc)) {
+    shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
+
+    // Before running the kernels, check that device has enough local memory
+    // to hold local arrays required for the tree-reduction algorithm.
+    constexpr bool IsTreeReduction =
+        !Reduction::has_fast_reduce && !Reduction::has_fast_atomics;
+    size_t OneElemSize =
+        IsTreeReduction ? sizeof(typename Reduction::result_type) : 0;
+    uint32_t NumConcurrentWorkGroups =
+#ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
+        __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
+#else
+        ONEAPI::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
+#endif
+    // TODO: currently the maximal work group size is determined for the given
+    // queue/device, while it is safer to use queries to the kernel pre-compiled
+    // for the device.
+    size_t MaxWGSize = ONEAPI::detail::reduGetMaxWGSize(MQueue, OneElemSize);
+    ONEAPI::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, MaxWGSize,
+                                           NumConcurrentWorkGroups, Redu);
+    if (Reduction::is_usm ||
+        (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
+        (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
+      this->finalize();
+      handler CopyHandler(QueueCopy, MIsHost);
+      CopyHandler.saveCodeLoc(MCodeLoc);
+      ONEAPI::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
+                                                               Redu);
+      MLastEvent = CopyHandler.finalize();
+    }
+  }
+
   /// Implements parallel_for() accepting nd_range \p Range and one reduction
   /// object. This version uses fast sycl::atomic operations to update reduction
   /// variable at the end of each work-group work.
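With this overload in place, a reduction can be launched directly over a `range` instead of a hand-built `nd_range`; the runtime picks the work-group size and count itself. A usage sketch under the ONEAPI reduction API visible in this diff (buffer names and the kernel name `SumRange` are illustrative, not from the patch):

```cpp
#include <CL/sycl.hpp>
#include <vector>
using namespace cl::sycl;

int main() {
  constexpr size_t N = 1024;
  std::vector<int> Data(N, 1);
  int Sum = 0;
  {
    queue Q;
    buffer<int> In{Data.data(), range<1>{N}};
    buffer<int> Out{&Sum, range<1>{1}};
    Q.submit([&](handler &CGH) {
      auto InAcc = In.get_access<access::mode::read>(CGH);
      auto OutAcc = Out.get_access<access::mode::read_write>(CGH);
      // The kernel takes the delinearized id plus a reducer reference.
      CGH.parallel_for<class SumRange>(
          range<1>{N}, ONEAPI::reduction(OutAcc, ONEAPI::plus<int>()),
          [=](id<1> I, auto &Reducer) { Reducer += InAcc[I]; });
    });
  } // buffers synchronize on destruction; Sum == 1024
}
```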
diff --git a/sycl/include/CL/sycl/id.hpp b/sycl/include/CL/sycl/id.hpp
index 151657aa661e8..8184d661e5551 100644
--- a/sycl/include/CL/sycl/id.hpp
+++ b/sycl/include/CL/sycl/id.hpp
@@ -254,6 +254,25 @@ size_t getOffsetForId(range<Dims> Range, id<Dims> Id,
     offset = offset * Range[i] + Offset[i] + Id[i];
   return offset;
 }
+
+inline id<1> getDelinearizedId(const range<1> &, size_t Index) {
+  return {Index};
+}
+
+inline id<2> getDelinearizedId(const range<2> &Range, size_t Index) {
+  size_t X = Index % Range[1];
+  size_t Y = Index / Range[1];
+  return {Y, X};
+}
+
+inline id<3> getDelinearizedId(const range<3> &Range, size_t Index) {
+  size_t D1D2 = Range[1] * Range[2];
+  size_t Z = Index / D1D2;
+  size_t ZRest = Index % D1D2;
+  size_t Y = ZRest / Range[2];
+  size_t X = ZRest % Range[2];
+  return {Z, Y, X};
+}
 } // namespace detail
 
 // C++ feature test macros are supported by all supported compilers
diff --git a/sycl/source/detail/reduction.cpp b/sycl/source/detail/reduction.cpp
index d603d80125995..bb1d19441f9a4 100644
--- a/sycl/source/detail/reduction.cpp
+++ b/sycl/source/detail/reduction.cpp
@@ -48,6 +48,20 @@ __SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
   return WGSize;
 }
 
+// Returns the estimated number of physical threads on the device associated
+// with the given queue.
+__SYCL_EXPORT uint32_t reduGetMaxNumConcurrentWorkGroups(
+    std::shared_ptr<sycl::detail::queue_impl> Queue) {
+  device Dev = Queue->get_device();
+  uint32_t NumThreads = Dev.get_info<info::device::max_compute_units>();
+  // The heuristics require additional tuning for various devices and vendors.
+  // For now, assuming that each execution unit has about 8 working threads
+  // gives good results on some known/supported GPU devices.
+  if (Dev.is_gpu())
+    NumThreads *= 8;
+  return NumThreads;
+}
+
 __SYCL_EXPORT size_t
 reduGetMaxWGSize(std::shared_ptr<queue_impl> Queue,
                  size_t LocalMemBytesPerWorkItem) {
diff --git a/sycl/test/abi/layout_array.cpp b/sycl/test/abi/layout_array.cpp
index 22e2c893ae9fb..f28d3af04fb30 100644
--- a/sycl/test/abi/layout_array.cpp
+++ b/sycl/test/abi/layout_array.cpp
@@ -9,9 +9,9 @@
 
 #include <CL/sycl.hpp>
 
-SYCL_EXTERNAL void range(sycl::range<2>) {}
+SYCL_EXTERNAL void id(sycl::id<2>) {}
 
-// CHECK: 0 | class sycl::range<2>
+// CHECK: 0 | class sycl::id<2>
 // CHECK-NEXT: 0 | class sycl::detail::array<2> (base)
 // CHECK-NEXT: 0 | size_t [2] common_array
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=8,
@@ -19,9 +19,9 @@ SYCL_EXTERNAL void range(sycl::range<2>) {}
 
 //----------------------------
 
-SYCL_EXTERNAL void id(sycl::id<2>) {}
+SYCL_EXTERNAL void range(sycl::range<2>) {}
 
-// CHECK: 0 | class sycl::id<2>
+// CHECK: 0 | class sycl::range<2>
 // CHECK-NEXT: 0 | class sycl::detail::array<2> (base)
 // CHECK-NEXT: 0 | size_t [2] common_array
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=8,
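A quick arithmetic check of the 3-D `getDelinearizedId` added to id.hpp above, using `range<3>{4, 5, 6}` and linear index 100 (values chosen purely for illustration):

```cpp
#include <cassert>
#include <cstddef>

int main() {
  const size_t Range[3] = {4, 5, 6};
  const size_t Index = 100;
  size_t D1D2 = Range[1] * Range[2]; // 30
  size_t Z = Index / D1D2;           // 3
  size_t ZRest = Index % D1D2;       // 10
  size_t Y = ZRest / Range[2];       // 1
  size_t X = ZRest % Range[2];       // 4
  // Linearizing id<3>{3, 1, 4} again recovers the original index.
  assert((Z * Range[1] + Y) * Range[2] + X == Index);
  (void)X;
  return 0;
}
```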
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 0683df8dcc193..bb37bb2c1af37 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3674,6 +3674,7 @@
 _ZN2cl4sycl5queueC2ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE
 _ZN2cl4sycl5queueC2ERKNS0_7contextERKNS0_6deviceERKSt8functionIFvNS0_14exception_listEEERKNS0_13property_listE
 _ZN2cl4sycl6ONEAPI15filter_selectorC1ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
 _ZN2cl4sycl6ONEAPI15filter_selectorC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
+_ZN2cl4sycl6ONEAPI6detail33reduGetMaxNumConcurrentWorkGroupsESt10shared_ptrINS0_6detail10queue_implEE
 _ZN2cl4sycl6ONEAPI6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm
 _ZN2cl4sycl6ONEAPI6detail17reduComputeWGSizeEmmRm
 _ZN2cl4sycl6detail10build_implERKNS0_13kernel_bundleILNS0_12bundle_stateE0EEERKSt6vectorINS0_6deviceESaIS8_EERKNS0_13property_listE