diff --git a/sycl/include/CL/sycl/builtins.hpp b/sycl/include/CL/sycl/builtins.hpp
index 400fd080cef1f..784dcfbac20df 100644
--- a/sycl/include/CL/sycl/builtins.hpp
+++ b/sycl/include/CL/sycl/builtins.hpp
@@ -24,11 +24,7 @@ namespace sycl {
 #else
 namespace __sycl_std = __host_std;
 #endif
-} // namespace sycl
-} // __SYCL_INLINE_NAMESPACE(cl)
 
-__SYCL_INLINE_NAMESPACE(cl) {
-namespace sycl {
 /* ----------------- 4.13.3 Math functions. ---------------------------------*/
 // genfloat acos (genfloat x)
 template <typename T>
@@ -731,7 +727,8 @@ detail::enable_if_t<detail::is_geninteger<T>::value, T> clz(T x) __NOEXC {
 namespace intel {
 // geninteger ctz (geninteger x)
 template <typename T>
-detail::enable_if_t<detail::is_geninteger<T>::value, T> ctz(T x) __NOEXC {
+sycl::detail::enable_if_t<sycl::detail::is_geninteger<T>::value, T>
+ctz(T x) __NOEXC {
   return __sycl_std::__invoke_ctz<T>(x);
 }
 } // namespace intel
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index fa1b992624644..100c29e8ebe38 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -46,7 +46,7 @@ class interop_handler {
 
 public:
   using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
-  using ReqToMem = std::pair<detail::Requirement*, pi_mem>;
+  using ReqToMem = std::pair<detail::Requirement *, pi_mem>;
 
   interop_handler(std::vector<ReqToMem> MemObjs, QueueImplPtr Queue)
       : MQueue(std::move(Queue)), MMemObjs(std::move(MemObjs)) {}
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index dc3c49745b43f..941e878ebb591 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -106,8 +106,45 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
 };
 
 __SYCL_EXPORT device getDeviceFromHandler(handler &);
+
+/// These are the forward declaration for the classes that help to create
+/// names for additional kernels. It is used only when there are
+/// more then 1 kernels in one parallel_for() implementing SYCL reduction.
+template <typename Type> class __sycl_reduction_main_2nd_kernel;
+template <typename Type> class __sycl_reduction_aux_1st_kernel;
+template <typename Type> class __sycl_reduction_aux_2nd_kernel;
+
+/// Helper structs to get additional kernel name types based on given
+/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
+/// \c Type becomes the \c Name.
+template <typename Name, typename Type>
+struct get_reduction_main_2nd_kernel_name_t {
+  using name = __sycl_reduction_main_2nd_kernel<
+      typename get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_1st_kernel_name_t {
+  using name = __sycl_reduction_aux_1st_kernel<
+      typename get_kernel_name_t<Name, Type>::name>;
+};
+template <typename Name, typename Type>
+struct get_reduction_aux_2nd_kernel_name_t {
+  using name = __sycl_reduction_aux_2nd_kernel<
+      typename get_kernel_name_t<Name, Type>::name>;
+};
+
+device getDeviceFromHandler(handler &);
+
 } // namespace detail
 
+namespace intel {
+namespace detail {
+template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
+          access::placeholder IsPlaceholder>
+class reduction_impl;
+} // namespace detail
+} // namespace intel
+
 /// 4.8.3 Command group handler class
 ///
 /// Objects of the handler class collect information about command group, such
@@ -190,14 +227,24 @@ class __SYCL_EXPORT handler {
     return LambdaName == KernelName;
   }
 
+  /// Saves the location of user's code passed in \param CodeLoc for future
+  /// usage in finalize() method.
+  void saveCodeLoc(detail::code_location CodeLoc) { MCodeLoc = CodeLoc; }
+
+  /// Stores the given \param Event to the \param Queue.
+  /// Even though MQueue is a field of handler, the method addEvent() of
+  /// queue_impl class cannot be called inside this handler.hpp file
+  /// as queue_impl is incomplete class for handler.
+  static void addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
+                              cl::sycl::event Event);
+
   /// Constructs CG object of specific type, passes it to Scheduler and
   /// returns sycl::event object representing the command group.
   /// It's expected that the method is the latest method executed before
   /// object destruction.
   ///
-  /// \param Payload contains the code location of user code
   /// \return a SYCL event object representing the command group
-  event finalize(const cl::sycl::detail::code_location &Payload = {});
+  event finalize();
 
   /// Saves streams associated with this handler.
   ///
@@ -208,6 +255,16 @@ class __SYCL_EXPORT handler {
     MStreamStorage.push_back(std::move(Stream));
   }
 
+  /// Saves buffers created by handling reduction feature in handler.
+  /// They are then forwarded to command group and destroyed only after
+  /// the command group finishes the work on device/host.
+  /// The 'MSharedPtrStorage' suits that need.
+  ///
+  /// @param ReduObj is a pointer to object that must be stored.
+  void addReduction(shared_ptr_class<const void> ReduObj) {
+    MSharedPtrStorage.push_back(std::move(ReduObj));
+  }
+
   ~handler() = default;
 
   bool is_host() { return MIsHost; }
@@ -231,6 +288,30 @@ class __SYCL_EXPORT handler {
                                      /*index*/ 0);
   }
 
+  template <typename DataT, int Dims, access::mode AccessMode,
+            access::target AccessTarget>
+  void dissociateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
+                                      access::placeholder::false_t>
+                                 Acc) {
+    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
+    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
+    detail::Requirement *Req = AccImpl.get();
+
+    // Remove accessor from the list of requirements, accessors storage,
+    // and from the list of associated accessors.
+    auto ReqIt = std::find(MRequirements.begin(), MRequirements.end(), Req);
+    auto AccIt = std::find(MAccStorage.begin(), MAccStorage.end(), AccImpl);
+    auto It =
+        std::find_if(MAssociatedAccesors.begin(), MAssociatedAccesors.end(),
+                     [Req](const detail::ArgDesc &D) { return D.MPtr == Req; });
+    assert((ReqIt != MRequirements.end() && AccIt != MAccStorage.end() &&
+            It != MAssociatedAccesors.end()) &&
+           "Cannot dissociate accessor.");
+    MRequirements.erase(ReqIt);
+    MAccStorage.erase(AccIt);
+    MAssociatedAccesors.erase(It);
+  }
+
   // Recursively calls itself until arguments pack is fully processed.
   // The version for regular(standard layout) argument.
   template <typename T, typename... Ts>
@@ -729,6 +810,307 @@ class __SYCL_EXPORT handler {
 #endif
   }
 
+  /// Implements a command group function that enqueues a kernel that calls
+  /// user's lambda function \param KernelFunc and does one iteration of
+  /// reduction of elements in each of work-groups.
+  /// This version uses tree-reduction algorithm to reduce elements in each
+  /// of work-groups. At the end of each work-group the partial sum is written
+  /// to a global buffer.
+  ///
+  /// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
+  template <typename KernelName, typename KernelType, int Dims, class Reduction>
+  void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
+                  Reduction &Redu) {
+
+    size_t NWorkItems = Range.get_global_range().size();
+    size_t WGSize = Range.get_local_range().size();
+    size_t NWorkGroups = Range.get_group_range().size();
+
+    bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
+    bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
+
+    bool IsUpdateOfUserAcc =
+        Reduction::accessor_mode == access::mode::read_write &&
+        NWorkGroups == 1;
+
+    // Use local memory to reduce elements in work-groups into 0-th element.
+    // If WGSize is not power of two, then WGSize+1 elements are allocated.
+    // The additional last element is used to catch elements that could
+    // otherwise be lost in the tree-reduction algorithm.
+    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
+
+    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
+    auto ReduIdentity = Redu.getIdentity();
+    if (IsEfficientCase) {
+      // Efficient case: work-groups are fully loaded and work-group size
+      // is power of two.
+      parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
+        // Call user's functions. Reducer.MValue gets initialized there.
+        typename Reduction::reducer_type Reducer(ReduIdentity);
+        KernelFunc(NDIt, Reducer);
+
+        // Copy the element to local memory to prepare it for tree-reduction.
+        size_t LID = NDIt.get_local_linear_id();
+        LocalReds[LID] = Reducer.MValue;
+        NDIt.barrier();
+
+        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
+        typename Reduction::binary_operation BOp;
+        size_t WGSize = NDIt.get_local_range().size();
+        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+          if (LID < CurStep)
+            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+          NDIt.barrier();
+        }
+
+        // Compute the partial sum/reduction for the work-group.
+        if (LID == 0)
+          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                                : LocalReds[0];
+      });
+    } else {
+      // Inefficient case: work-groups are not fully loaded
+      // or WGSize is not power of two.
+      // These two inefficient cases are handled by one kernel, which
+      // can be split later into two separate kernels, if there are users who
+      // really need more efficient code for them.
+      using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
+          KernelName, KernelType>::name;
+      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+        // Call user's functions. Reducer.MValue gets initialized there.
+        typename Reduction::reducer_type Reducer(ReduIdentity);
+        KernelFunc(NDIt, Reducer);
+
+        size_t WGSize = NDIt.get_local_range().size();
+        size_t LID = NDIt.get_local_linear_id();
+        size_t GID = NDIt.get_global_linear_id();
+        // Copy the element to local memory to prepare it for tree-reduction.
+        LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
+        LocalReds[WGSize] = ReduIdentity;
+        NDIt.barrier();
+
+        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+        // LocalReds[WGSize] accumulates last/odd elements when the step
+        // of tree-reduction loop is not even.
+        typename Reduction::binary_operation BOp;
+        size_t PrevStep = WGSize;
+        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+          if (LID < CurStep)
+            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+          else if (LID == CurStep && (PrevStep & 0x1))
+            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+          NDIt.barrier();
+          PrevStep = CurStep;
+        }
+
+        // Compute the partial sum/reduction for the work-group.
+        if (LID == 0) {
+          auto GrID = NDIt.get_group_linear_id();
+          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+          Out.get_pointer().get()[GrID] =
+              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+        }
+      });
+    }
+  }
+
+  /// Implements a command group function that enqueues a kernel that does one
+  /// iteration of reduction of elements in each of work-groups.
+  /// This version uses tree-reduction algorithm to reduce elements in each
+  /// of work-groups. At the end of each work-group the partial sum is written
+  /// to a global buffer.
+  ///
+  /// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
+  template <typename KernelName, typename KernelType, int Dims, class Reduction>
+  void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
+                     size_t KernelRun, Reduction &Redu) {
+    size_t WGSize = Range.get_local_range().size();
+    size_t NWorkGroups = Range.get_group_range().size();
+
+    // The last work-group may be not fully loaded with work, or the work group
+    // size may be not power of those. Those two cases considered inefficient
+    // as they require additional code and checks in the kernel.
+    bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
+    bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
+
+    bool IsUpdateOfUserAcc =
+        Reduction::accessor_mode == access::mode::read_write &&
+        NWorkGroups == 1;
+
+    // Use local memory to reduce elements in work-groups into 0-th element.
+    // If WGSize is not power of two, then WGSize+1 elements are allocated.
+    // The additional last element is used to catch elements that could
+    // otherwise be lost in the tree-reduction algorithm.
+    size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
+    auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
+
+    // Get read accessor to the buffer that was used as output
+    // in the previous kernel. After that create new output buffer if needed
+    // and get accessor to it (or use reduction's accessor if the kernel
+    // is the last one).
+    auto In = Redu.getReadAccToPreviousPartialReds(*this);
+    auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
+
+    if (IsEfficientCase) {
+      // Efficient case: work-groups are fully loaded and work-group size
+      // is power of two.
+      using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
+          KernelName, KernelType>::name;
+      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+        // Copy the element to local memory to prepare it for tree-reduction.
+        size_t LID = NDIt.get_local_linear_id();
+        size_t GID = NDIt.get_global_linear_id();
+        LocalReds[LID] = In[GID];
+        NDIt.barrier();
+
+        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+        typename Reduction::binary_operation BOp;
+        size_t WGSize = NDIt.get_local_range().size();
+        for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
+          if (LID < CurStep)
+            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+          NDIt.barrier();
+        }
+
+        // Compute the partial sum/reduction for the work-group.
+        if (LID == 0)
+          Out.get_pointer().get()[NDIt.get_group_linear_id()] =
+              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
+                                : LocalReds[0];
+      });
+    } else {
+      // Inefficient case: work-groups are not fully loaded
+      // or WGSize is not power of two.
+      // These two inefficient cases are handled by one kernel, which
+      // can be split later into two separate kernels, if there are users
+      // who really need more efficient code for them.
+      using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
+          KernelName, KernelType>::name;
+      auto ReduIdentity = Redu.getIdentity();
+      parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
+        size_t WGSize = NDIt.get_local_range().size();
+        size_t LID = NDIt.get_local_linear_id();
+        size_t GID = NDIt.get_global_linear_id();
+        // Copy the element to local memory to prepare it for tree-reduction
+        LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
+        LocalReds[WGSize] = ReduIdentity;
+        NDIt.barrier();
+
+        // Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
+        // LocalReds[WGSize] accumulates last/odd elements when the step
+        // of tree-reduction loop is not even.
+        typename Reduction::binary_operation BOp;
+        size_t PrevStep = WGSize;
+        for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
+          if (LID < CurStep)
+            LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
+          else if (LID == CurStep && (PrevStep & 0x1))
+            LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
+          NDIt.barrier();
+          PrevStep = CurStep;
+        }
+
+        // Compute the partial sum/reduction for the work-group.
+        if (LID == 0) {
+          auto GrID = NDIt.get_group_linear_id();
+          auto V = BOp(LocalReds[0], LocalReds[WGSize]);
+          Out.get_pointer().get()[GrID] =
+              IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+        }
+      });
+    }
+  }
+
+  /// Defines and invokes a SYCL kernel function for the specified nd_range.
+  /// Performs reduction operation specified in \param Redu.
+  ///
+  /// The SYCL kernel function is defined as a lambda function or a named
+  /// function object type and given an id or item for indexing in the indexing
+  /// space defined by range.
+  /// If it is a named function object and the function object type is
+  /// globally visible, there is no need for the developer to provide
+  /// a kernel name for it.
+  ///
+  /// TODO: currently it calls only those versions of kernels that can handle
+  /// custom types and operations. Some of types and operations may use faster
+  /// implementations that use intel::reduce() and/or sycl::atomic.fetch_<op>()
+  /// functions and thus provide much better performance. Those variants exist,
+  /// are fully functional. They just wait for their time for code-review.
+  /// TODO: Need to handle more than 1 reduction in parallel_for().
+  /// TODO: Support HOST. The kernels called by this parallel_for() may use
+  /// some functionality that is not yet supported on HOST such as:
+  /// barrier(), and intel::reduce() that also may be used in more
+  /// optimized implementations waiting for their turn of code-review.
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  void parallel_for(nd_range<Dims> Range, Reduction &Redu,
+                    KernelType KernelFunc) {
+    size_t NWorkGroups = Range.get_group_range().size();
+
+    // This parallel_for() is lowered to the following sequence:
+    // 1) Call a kernel that a) call user's lambda function and b) performs
+    //    one iteration of reduction, storing the partial reductions/sums
+    //    to either a newly created global buffer or to user's reduction
+    //    accessor. So, if the original 'Range' has totally
+    //    N1 elements and work-group size is W, then after the first iteration
+    //    there will be N2 partial sums where N2 = N1 / W.
+    //    If (N2 == 1) then the partial sum is written to user's accessor.
+    //    Otherwise, a new global buffer is created and partial sums are written
+    //    to it.
+    // 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
+    //    necessary to reduce all partial sums into one final sum.
+
+    // 1. Call the kernel that includes user's lambda function.
+    // If this kernel is going to be now last one, i.e. it does not write
+    // to user's accessor, then detach user's accessor from this kernel
+    // to make the dependencies between accessors and kernels more clean and
+    // correct.
+    if (NWorkGroups > 1)
+      dissociateWithHandler(Redu.MAcc);
+
+    reduCGFunc<KernelName>(KernelFunc, Range, Redu);
+    auto QueueCopy = MQueue;
+    MLastEvent = this->finalize();
+
+    // 2. Run the additional aux kernel as many times as needed to reduce
+    // all partial sums into one scalar.
+    size_t WGSize = Range.get_local_range().size();
+    size_t NWorkItems = NWorkGroups;
+    size_t KernelRun = 1;
+    while (NWorkItems > 1) {
+      // Before creating another kernel, add the event from the previous kernel
+      // to queue.
+      addEventToQueue(QueueCopy, MLastEvent);
+
+      // TODO: here the work-group size is not limited by user's needs,
+      // the better strategy here is to make the work-group-size as big
+      // as possible.
+      WGSize = std::min(WGSize, NWorkItems);
+      NWorkGroups = NWorkItems / WGSize;
+      // The last group may be not fully loaded. Still register it as a group.
+      if ((NWorkItems % WGSize) != 0)
+        ++NWorkGroups;
+      auto Range =
+          nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
+
+      handler AuxHandler(QueueCopy, MIsHost);
+      AuxHandler.saveCodeLoc(MCodeLoc);
+
+      // The last kernel DOES write to reduction's accessor.
+      // Associate it with handler manually.
+      if (NWorkGroups == 1)
+        AuxHandler.associateWithHandler(Redu.MAcc);
+      AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
+                                                       KernelRun, Redu);
+      MLastEvent = AuxHandler.finalize();
+
+      NWorkItems = NWorkGroups;
+      ++KernelRun;
+    } // end while (NWorkItems > 1)
+  }
+
   /// Hierarchical kernel invocation method of a kernel defined as a lambda
   /// encoding the body of each work-group to launch.
   ///
@@ -1368,6 +1750,10 @@ class __SYCL_EXPORT handler {
 
   bool MIsHost = false;
 
+  detail::code_location MCodeLoc = {};
+  bool MIsFinalized = false;
+  event MLastEvent;
+
   // Make queue_impl class friend to be able to call finalize method.
   friend class detail::queue_impl;
   // Make accessor class friend to keep the list of associated accessors.
@@ -1382,6 +1768,11 @@ class __SYCL_EXPORT handler {
   // Make stream class friend to be able to keep the list of associated streams
   friend class stream;
   friend class detail::stream_impl;
+  // Make reduction_impl friend to store buffers and arrays created for it
+  // in handler from reduction_impl methods.
+  template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
+            access::placeholder IsPlaceholder>
+  friend class intel::detail::reduction_impl;
 };
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/function_pointer.hpp b/sycl/include/CL/sycl/intel/function_pointer.hpp
index 2aa64cdd2e580..f812be911b788 100644
--- a/sycl/include/CL/sycl/intel/function_pointer.hpp
+++ b/sycl/include/CL/sycl/intel/function_pointer.hpp
@@ -81,7 +81,7 @@ device_func_ptr_holder_t get_device_func_ptr(FuncType F, const char *FuncName,
         PI_INVALID_OPERATION);
   }
 
-  return detail::getDeviceFunctionPointerImpl(D, P, FuncName);
+  return sycl::detail::getDeviceFunctionPointerImpl(D, P, FuncName);
 }
 } // namespace intel
 } // namespace sycl
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index 2b0fb264bc94f..3b6289cb03ecc 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -342,6 +342,31 @@ class reduction_impl {
            "Only scalar/1-element reductions are supported now.");
   }
 
+  accessor<T, buffer_dim, access::mode::discard_read_write,
+           access::target::local>
+  getReadWriteLocalAcc(size_t Size, handler &CGH) {
+    return accessor<T, buffer_dim, access::mode::discard_read_write,
+                    access::target::local>(Size, CGH);
+  }
+
+  accessor<T, buffer_dim, access::mode::read>
+  getReadAccToPreviousPartialReds(handler &CGH) const {
+    CGH.addReduction(MOutBufPtr);
+    return accessor<T, buffer_dim, access::mode::read>(*MOutBufPtr, CGH);
+  }
+
+  accessor_type getWriteAccForPartialReds(size_t Size, size_t RunNumber,
+                                          handler &CGH) {
+    if (Size == 1) {
+      if (RunNumber > 0)
+        CGH.associateWithHandler(this->MAcc);
+      return this->MAcc;
+    }
+    // Create a new output buffer and return an accessor to it.
+    MOutBufPtr = std::make_shared<buffer<T, buffer_dim>>(range<1>(Size));
+    CGH.addReduction(MOutBufPtr);
+    return accessor_type(*MOutBufPtr, CGH);
+  }
   /// User's accessor to where the reduction must be written.
   accessor_type MAcc;
 
@@ -349,6 +374,7 @@ class reduction_impl {
   /// Identity of the BinaryOperation.
   /// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
   const T MIdentity;
+  shared_ptr_class<buffer<T, buffer_dim>> MOutBufPtr;
 };
 
 } // namespace detail
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 937223f114c5f..9b65ead92fcde 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -351,6 +351,11 @@ class queue_impl {
   /// \return a native handle.
   pi_native_handle getNative() const;
 
+  /// Stores an event that should be associated with the queue
+  ///
+  /// \param Event is the event to be stored
+  void addEvent(event Event);
+
 private:
   /// Performs command group submission to the queue.
   ///
@@ -362,8 +367,9 @@ class queue_impl {
                     shared_ptr_class<queue_impl> Self,
                     const detail::code_location &Loc) {
     handler Handler(std::move(Self), MHostQueue);
+    Handler.saveCodeLoc(Loc);
     CGF(Handler);
-    event Event = Handler.finalize(Loc);
+    event Event = Handler.finalize();
     addEvent(Event);
     return Event;
   }
@@ -377,11 +383,6 @@ class queue_impl {
   void instrumentationEpilog(void *TelementryEvent, string_class &Name,
                              int32_t StreamID, uint64_t IId);
 
-  /// Stores an event that should be associated with the queue
-  ///
-  /// \param Event is the event to be stored
-  void addEvent(event Event);
-
   /// Stores a USM operation event that should be associated with the queue
   ///
   /// \param Event is the event to be stored
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index ac6baf67bfd25..8685b1060dcdc 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -13,12 +13,24 @@
 #include <CL/sycl/handler.hpp>
 #include <CL/sycl/info/info_desc.hpp>
 #include <detail/kernel_impl.hpp>
+#include <detail/queue_impl.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-event handler::finalize(const cl::sycl::detail::code_location &Payload) {
-  sycl::event EventRet;
+
+void handler::addEventToQueue(shared_ptr_class<detail::queue_impl> Queue,
+                              cl::sycl::event Event) {
+  Queue->addEvent(std::move(Event));
+}
+
+event handler::finalize() {
+  // This block of code is needed only for reduction implementation.
+  // It is harmless (does nothing) for everything else.
+  if (MIsFinalized)
+    return MLastEvent;
+  MIsFinalized = true;
+
   unique_ptr_class<detail::CG> CommandGroup;
   switch (MCGType) {
   case detail::CG::KERNEL:
@@ -29,14 +41,14 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
         std::move(MSharedPtrStorage), std::move(MRequirements),
         std::move(MEvents), std::move(MArgs), std::move(MKernelName),
         std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
-        Payload));
+        MCodeLoc));
     break;
   }
   case detail::CG::INTEROP_TASK_CODEPLAY:
     CommandGroup.reset(new detail::CGInteropTask(
         std::move(MInteropTask), std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), MCGType, Payload));
+        std::move(MRequirements), std::move(MEvents), MCGType, MCodeLoc));
     break;
   case detail::CG::COPY_ACC_TO_PTR:
   case detail::CG::COPY_PTR_TO_ACC:
@@ -44,37 +56,37 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
     CommandGroup.reset(new detail::CGCopy(
         MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::FILL:
     CommandGroup.reset(new detail::CGFill(
         std::move(MPattern), MDstPtr, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::UPDATE_HOST:
     CommandGroup.reset(new detail::CGUpdateHost(
         MDstPtr, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents), Payload));
+        std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::COPY_USM:
     CommandGroup.reset(new detail::CGCopyUSM(
         MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::FILL_USM:
     CommandGroup.reset(new detail::CGFillUSM(
         std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), Payload));
+        std::move(MRequirements), std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::PREFETCH_USM:
     CommandGroup.reset(new detail::CGPrefetchUSM(
         MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents), Payload));
+        std::move(MEvents), MCodeLoc));
     break;
   case detail::CG::NONE:
     throw runtime_error("Command group submitted without a kernel or a "
@@ -88,8 +100,8 @@ event handler::finalize(const cl::sycl::detail::code_location &Payload) {
   detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG(
       std::move(CommandGroup), std::move(MQueue));
 
-  EventRet = detail::createSyclObjFromImpl<event>(Event);
-  return EventRet;
+  MLastEvent = detail::createSyclObjFromImpl<event>(Event);
+  return MLastEvent;
 }
 
 void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind,
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index ccab7147a68c7..700716600350d 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3232,7 +3232,8 @@ _ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
 _ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
 _ZN2cl4sycl7handler18extractArgsAndReqsEv
 _ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
-_ZN2cl4sycl7handler8finalizeERKNS0_6detail13code_locationE
+_ZN2cl4sycl7handler15addEventToQueueESt10shared_ptrINS0_6detail10queue_implEENS0_5eventE
+_ZN2cl4sycl7handler8finalizeEv
 _ZN2cl4sycl7program17build_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program19compile_with_sourceENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_
 _ZN2cl4sycl7program22build_with_kernel_nameENSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEES7_l
diff --git a/sycl/test/abi/symbol_size.cpp b/sycl/test/abi/symbol_size.cpp
index bfdeaf19df24f..5b595a86978ea 100644
--- a/sycl/test/abi/symbol_size.cpp
+++ b/sycl/test/abi/symbol_size.cpp
@@ -43,7 +43,11 @@ int main() {
   check_size<device_selector, 8>();
   check_size<event, 16>();
   check_size<gpu_selector, 8>();
-  check_size<handler, 472>();
+#ifdef _MSC_VER
+  check_size<handler, 520>();
+#else
+  check_size<handler, 528>();
+#endif
   check_size<image<1>, 16>();
   check_size<kernel, 16>();
   check_size<platform, 16>();
diff --git a/sycl/test/reduction/reduction_nd_conditional.cpp b/sycl/test/reduction/reduction_nd_conditional.cpp
new file mode 100644
index 0000000000000..2c67fda514b04
--- /dev/null
+++ b/sycl/test/reduction/reduction_nd_conditional.cpp
@@ -0,0 +1,121 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reduction and conditional increment of the reduction variable.
+
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, class BinaryOperation>
+void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
+                   BinaryOperation BOp, size_t N) {
+  ExpectedOut = Identity;
+  auto In = InBuf.template get_access<access::mode::write>();
+  for (int I = 0; I < N; ++I) {
+    if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
+      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
+    else
+      In[I] = I + 1 + 1.1;
+
+    if (I < 2)
+      ExpectedOut = BOp(ExpectedOut, 99);
+    else if (I % 3)
+      ExpectedOut = BOp(ExpectedOut, In[I]);
+    else
+      ; // do nothing.
+  }
+};
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T>
+struct Vec {
+  Vec() : X(0), Y(0) {}
+  Vec(T X, T Y) : X(X), Y(Y) {}
+  Vec(T V) : X(V), Y(V) {}
+  bool operator==(const Vec &P) const {
+    return P.X == X && P.Y == Y;
+  }
+  bool operator!=(const Vec &P) const {
+    return !(*this == P);
+  }
+  T X;
+  T Y;
+};
+template <typename T>
+bool operator==(const Vec<T> &A, const Vec<T> &B) {
+  return A.X == B.X && A.Y == B.Y;
+}
+template <typename T>
+std::ostream &operator<<(std::ostream &OS, const Vec<T> &P) {
+  return OS << "(" << P.X << ", " << P.Y << ")";
+}
+
+template <class T>
+struct VecPlus {
+  using P = Vec<T>;
+  P operator()(const P &A, const P &B) const {
+    return P(A.X + B.X, A.Y + B.Y);
+  }
+};
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          size_t I = NDIt.get_global_linear_id();
+          if (I < 2)
+            Sum.combine(T(99));
+          else if (I % 3)
+            Sum.combine(In[I]);
+          else
+            ; // do nothing.
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  test<int, 0, intel::plus<int>>(0, 2, 2);
+  test<int, 1, intel::plus<int>>(0, 7, 7);
+  test<int, 0, intel::plus<int>>(0, 2, 64);
+  test<short, 1, intel::plus<short>>(0, 16, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
new file mode 100644
index 0000000000000..a3a9a8095e0ad
--- /dev/null
+++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -0,0 +1,96 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with 0-dimensional discard_write accessor.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  // Check some less standards WG sizes and corner cases first.
+  test<int, 0, intel::plus<int>>(0, 2, 2);
+  test<int, 0, intel::plus<int>>(0, 7, 7);
+  test<int, 0, intel::plus<int>>(0, 9, 18);
+  test<int, 0, intel::plus<int>>(0, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  test<int, 0, intel::plus<int>>(0, 2, 64);
+  test<int, 0, intel::plus<int>>(0, 4, 64);
+  test<int, 0, intel::plus<int>>(0, 8, 128);
+  test<int, 0, intel::plus<int>>(0, 16, 256);
+  test<int, 0, intel::plus<int>>(0, 32, 256);
+  test<int, 0, intel::plus<int>>(0, 64, 256);
+  test<int, 0, intel::plus<int>>(0, 128, 256);
+  test<int, 0, intel::plus<int>>(0, 256, 256);
+
+  // Check with various operations.
+  test<int, 0, std::multiplies<int>>(1, 8, 256);
+  test<int, 0, intel::bit_or<int>>(0, 8, 256);
+  test<int, 0, intel::bit_xor<int>>(0, 8, 256);
+  test<int, 0, intel::bit_and<int>>(~0, 8, 256);
+  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
+  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+
+  // Check with various types.
+  test<float, 0, std::multiplies<float>>(1, 8, 256);
+  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
+  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+
+  test<double, 0, std::multiplies<double>>(1, 8, 256);
+  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
+  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+
+  // Check with CUSTOM type.
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
new file mode 100644
index 0000000000000..7f58d311424e8
--- /dev/null
+++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -0,0 +1,98 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with 0-dimensional read_write accessor.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  (OutBuf.template get_access<access::mode::write>())[0] = Identity;
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::read_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  // Check some less standards WG sizes and corner cases first.
+  test<int, 0, intel::plus<int>>(0, 2, 2);
+  test<int, 0, intel::plus<int>>(0, 7, 7);
+  test<int, 0, intel::plus<int>>(0, 9, 18);
+  test<int, 0, intel::plus<int>>(0, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  test<int, 0, intel::plus<int>>(0, 2, 64);
+  test<int, 0, intel::plus<int>>(0, 4, 64);
+  test<int, 0, intel::plus<int>>(0, 8, 128);
+  test<int, 0, intel::plus<int>>(0, 16, 256);
+  test<int, 0, intel::plus<int>>(0, 32, 256);
+  test<int, 0, intel::plus<int>>(0, 64, 256);
+  test<int, 0, intel::plus<int>>(0, 128, 256);
+  test<int, 0, intel::plus<int>>(0, 256, 256);
+
+  // Check with various operations.
+  test<int, 0, std::multiplies<int>>(1, 8, 256);
+  test<int, 0, intel::bit_or<int>>(0, 8, 256);
+  test<int, 0, intel::bit_xor<int>>(0, 8, 256);
+  test<int, 0, intel::bit_and<int>>(~0, 8, 256);
+  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
+  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+
+  // Check with various types.
+  test<float, 0, std::multiplies<float>>(1, 8, 256);
+  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
+  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+
+  test<double, 0, std::multiplies<double>>(1, 8, 256);
+  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
+  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+
+  // Check with CUSTOM type.
+  test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
new file mode 100644
index 0000000000000..7b0ca8aeebfe5
--- /dev/null
+++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -0,0 +1,97 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  // Check some less standards WG sizes and corner cases first.
+  test<int, 1, intel::plus<int>>(0, 2, 2);
+  test<int, 1, intel::plus<int>>(0, 7, 7);
+  test<int, 1, intel::plus<int>>(0, 9, 18);
+  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  test<int, 1, intel::plus<int>>(0, 2, 64);
+  test<int, 1, intel::plus<int>>(0, 4, 64);
+  test<int, 1, intel::plus<int>>(0, 8, 128);
+  test<int, 1, intel::plus<int>>(0, 16, 256);
+  test<int, 1, intel::plus<int>>(0, 32, 256);
+  test<int, 1, intel::plus<int>>(0, 64, 256);
+  test<int, 1, intel::plus<int>>(0, 128, 256);
+  test<int, 1, intel::plus<int>>(0, 256, 256);
+
+  // Check with various operations.
+  test<int, 1, std::multiplies<int>>(1, 8, 256);
+  test<int, 1, intel::bit_or<int>>(0, 8, 256);
+  test<int, 1, intel::bit_xor<int>>(0, 8, 256);
+  test<int, 1, intel::bit_and<int>>(~0, 8, 256);
+  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
+  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+
+  // Check with various types.
+  test<float, 1, std::multiplies<float>>(1, 8, 256);
+  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
+  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+
+  test<double, 1, std::multiplies<double>>(1, 8, 256);
+  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
+  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+
+  // Check with CUSTOM type.
+  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
new file mode 100644
index 0000000000000..333b03bf40cb0
--- /dev/null
+++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -0,0 +1,99 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with reductions initialized with 1-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  (OutBuf.template get_access<access::mode::write>())[0] = Identity;
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::read_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+  // Check some less standards WG sizes and corner cases first.
+  test<int, 1, intel::plus<int>>(0, 2, 2);
+  test<int, 1, intel::plus<int>>(0, 7, 7);
+  test<int, 1, intel::plus<int>>(0, 9, 18);
+  test<int, 1, intel::plus<int>>(0, 49, 49 * 5);
+
+  // Try some power-of-two work-group sizes.
+  test<int, 1, intel::plus<int>>(0, 2, 64);
+  test<int, 1, intel::plus<int>>(0, 4, 64);
+  test<int, 1, intel::plus<int>>(0, 8, 128);
+  test<int, 1, intel::plus<int>>(0, 16, 256);
+  test<int, 1, intel::plus<int>>(0, 32, 256);
+  test<int, 1, intel::plus<int>>(0, 64, 256);
+  test<int, 1, intel::plus<int>>(0, 128, 256);
+  test<int, 1, intel::plus<int>>(0, 256, 256);
+
+  // Check with various operations.
+  test<int, 1, std::multiplies<int>>(1, 8, 256);
+  test<int, 1, intel::bit_or<int>>(0, 8, 256);
+  test<int, 1, intel::bit_xor<int>>(0, 8, 256);
+  test<int, 1, intel::bit_and<int>>(~0, 8, 256);
+  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
+  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+
+  // Check with various types.
+  test<float, 1, std::multiplies<float>>(1, 8, 256);
+  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
+  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+
+  test<double, 1, std::multiplies<double>>(1, 8, 256);
+  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
+  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+
+  // Check with CUSTOM type.
+  test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
new file mode 100644
index 0000000000000..d9392fc2784a4
--- /dev/null
+++ b/sycl/test/reduction/reduction_transparent.cpp
@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// where func is a transparent functor.
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+int main() {
+#if __cplusplus >= 201402L
+  test<double, 0, intel::maximum<>>(std::numeric_limits<double>::min(), 7, 7 * 5);
+  test<signed char, 0, intel::plus<>>(0, 7, 49);
+  test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
+#endif
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp
new file mode 100644
index 0000000000000..c7a1699298f02
--- /dev/null
+++ b/sycl/test/reduction/reduction_utils.hpp
@@ -0,0 +1,54 @@
+#include <CL/sycl.hpp>
+
+using namespace cl::sycl;
+
+// Initializes 'InBuf' buffer with pseudo-random values, computes the reduction
+// value for the buffer and writes it to 'ExpectedOut'.
+template <typename T, class BinaryOperation>
+void initInputData(buffer<T, 1> &InBuf, T &ExpectedOut, T Identity,
+                   BinaryOperation BOp, size_t N) {
+  ExpectedOut = Identity;
+  auto In = InBuf.template get_access<access::mode::write>();
+  for (int I = 0; I < N; ++I) {
+    if (std::is_same<BinaryOperation, std::multiplies<T>>::value)
+      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
+    else
+      In[I] = ((I + 1) % 5) + 1.1;
+    ExpectedOut = BOp(ExpectedOut, In[I]);
+  }
+};
+
+// This type is needed only to check that custom types are properly handled
+// in parallel_for() with reduction. For simplicity it needs a default
+// constructor, a constructor with one argument, operators ==, != and
+// printing to a stream.
+template <typename T>
+struct CustomVec {
+  CustomVec() : X(0), Y(0) {}
+  CustomVec(T X, T Y) : X(X), Y(Y) {}
+  CustomVec(T V) : X(V), Y(V) {}
+  bool operator==(const CustomVec &V) const {
+    return V.X == X && V.Y == Y;
+  }
+  bool operator!=(const CustomVec &V) const {
+    return !(*this == V);
+  }
+  T X;
+  T Y;
+};
+template <typename T>
+bool operator==(const CustomVec<T> &A, const CustomVec<T> &B) {
+  return A.X == B.X && A.Y == B.Y;
+}
+template <typename T>
+std::ostream &operator<<(std::ostream &OS, const CustomVec<T> &V) {
+  return OS << "(" << V.X << ", " << V.Y << ")";
+}
+
+template <class T>
+struct CustomVecPlus {
+  using CV = CustomVec<T>;
+  CV operator()(const CV &A, const CV &B) const {
+    return CV(A.X + B.X, A.Y + B.Y);
+  }
+};