diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 8044a09293c80..602a78cf56c73 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -853,7 +853,7 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     if (Reduction::is_usm)
       Redu.associateWithHandler(*this);
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
@@ -886,7 +886,7 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
                       Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     auto RWAcc = Redu.getReadWriteScalarAcc(*this);
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
@@ -920,7 +920,7 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
   detail::enable_if_t<!Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     size_t NWorkGroups = Range.get_group_range().size();
 
     // This parallel_for() is lowered to the following sequence:
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
index 7a996f365915d..4b9bec9829c81 100644
--- a/sycl/include/CL/sycl/intel/reduction.hpp
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -26,65 +26,105 @@ using cl::sycl::detail::is_geninteger8bit;
 using cl::sycl::detail::remove_AS;
 
 template <typename T, class BinaryOperation>
-using IsReduOptForFastAtomicFetch = detail::bool_constant<
-    (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
-     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
-     std::is_same<BinaryOperation, intel::maximum<T>>::value ||
-     std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
-     std::is_same<BinaryOperation, intel::bit_xor<T>>::value ||
-     std::is_same<BinaryOperation, intel::bit_and<T>>::value)>;
+using IsReduPlus = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::plus<T>>::value ||
+    std::is_same<BinaryOperation, intel::plus<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduMultiplies = detail::bool_constant<
+    std::is_same<BinaryOperation, std::multiplies<T>>::value ||
+    std::is_same<BinaryOperation, std::multiplies<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduMinimum = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::minimum<T>>::value ||
+    std::is_same<BinaryOperation, intel::minimum<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduMaximum = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::maximum<T>>::value ||
+    std::is_same<BinaryOperation, intel::maximum<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduBitOR = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
+    std::is_same<BinaryOperation, intel::bit_or<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduBitXOR = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::bit_xor<T>>::value ||
+    std::is_same<BinaryOperation, intel::bit_xor<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduBitAND = detail::bool_constant<
+    std::is_same<BinaryOperation, intel::bit_and<T>>::value ||
+    std::is_same<BinaryOperation, intel::bit_and<void>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsReduOptForFastAtomicFetch =
+    detail::bool_constant<(is_geninteger32bit<T>::value ||
+                           is_geninteger64bit<T>::value) &&
+                          (IsReduPlus<T, BinaryOperation>::value ||
+                           IsReduMinimum<T, BinaryOperation>::value ||
+                           IsReduMaximum<T, BinaryOperation>::value ||
+                           IsReduBitOR<T, BinaryOperation>::value ||
+                           IsReduBitXOR<T, BinaryOperation>::value ||
+                           IsReduBitAND<T, BinaryOperation>::value)>;
 
 template <typename T, class BinaryOperation>
 using IsReduOptForFastReduce = detail::bool_constant<
     (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
      std::is_same<T, half>::value || std::is_same<T, float>::value ||
      std::is_same<T, double>::value) &&
-    (std::is_same<BinaryOperation, intel::plus<T>>::value ||
-     std::is_same<BinaryOperation, intel::minimum<T>>::value ||
-     std::is_same<BinaryOperation, intel::maximum<T>>::value)>;
+    (IsReduPlus<T, BinaryOperation>::value ||
+     IsReduMinimum<T, BinaryOperation>::value ||
+     IsReduMaximum<T, BinaryOperation>::value)>;
 
 // Identity = 0
 template <typename T, class BinaryOperation>
 using IsZeroIdentityOp = bool_constant<
     ((is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
       is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-     (std::is_same<BinaryOperation, intel::plus<T>>::value ||
-      std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
-      std::is_same<BinaryOperation, intel::bit_xor<T>>::value)) ||
-    ((std::is_same<T, float>::value || std::is_same<T, double>::value) &&
-     std::is_same<BinaryOperation, intel::plus<T>>::value)>;
+     (IsReduPlus<T, BinaryOperation>::value ||
+      IsReduBitOR<T, BinaryOperation>::value ||
+      IsReduBitXOR<T, BinaryOperation>::value)) ||
+    ((std::is_same<T, half>::value || std::is_same<T, float>::value ||
+      std::is_same<T, double>::value) &&
+     IsReduPlus<T, BinaryOperation>::value)>;
 
 // Identity = 1
 template <typename T, class BinaryOperation>
 using IsOneIdentityOp = bool_constant<
     (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
      is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
-     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
-    std::is_same<BinaryOperation, std::multiplies<T>>::value>;
+     std::is_same<T, half>::value || std::is_same<T, float>::value ||
+     std::is_same<T, double>::value) &&
+    IsReduMultiplies<T, BinaryOperation>::value>;
 
 // Identity = ~0
 template <typename T, class BinaryOperation>
 using IsOnesIdentityOp = bool_constant<
     (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
      is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-    std::is_same<BinaryOperation, intel::bit_and<T>>::value>;
+    IsReduBitAND<T, BinaryOperation>::value>;
 
 // Identity = <max possible value>
 template <typename T, class BinaryOperation>
 using IsMinimumIdentityOp = bool_constant<
     (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
      is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
-     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
-    std::is_same<BinaryOperation, intel::minimum<T>>::value>;
+     std::is_same<T, half>::value || std::is_same<T, float>::value ||
+     std::is_same<T, double>::value) &&
+    IsReduMinimum<T, BinaryOperation>::value>;
 
 // Identity = <min possible value>
 template <typename T, class BinaryOperation>
 using IsMaximumIdentityOp = bool_constant<
     (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
      is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
-     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
-    std::is_same<BinaryOperation, intel::maximum<T>>::value>;
+     std::is_same<T, half>::value || std::is_same<T, float>::value ||
+     std::is_same<T, double>::value) &&
+    IsReduMaximum<T, BinaryOperation>::value>;
 
 template <typename T, class BinaryOperation>
 using IsKnownIdentityOp =
@@ -169,19 +209,23 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   static enable_if_t<IsMinimumIdentityOp<_T, _BinaryOperation>::value, _T>
   getIdentity() {
-    return (std::numeric_limits<_T>::max)();
+    return std::numeric_limits<_T>::has_infinity
+               ? std::numeric_limits<_T>::infinity()
+               : (std::numeric_limits<_T>::max)();
   }
 
   /// Returns minimal possible value as identity for MAX operations.
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   static enable_if_t<IsMaximumIdentityOp<_T, _BinaryOperation>::value, _T>
   getIdentity() {
-    return (std::numeric_limits<_T>::min)();
+    return std::numeric_limits<_T>::has_infinity
+               ? -std::numeric_limits<_T>::infinity()
+               : std::numeric_limits<_T>::lowest();
   }
 
   template <typename _T = T>
   enable_if_t<std::is_same<_T, T>::value &&
-                  std::is_same<BinaryOperation, intel::plus<T>>::value,
+                  IsReduPlus<T, BinaryOperation>::value,
               reducer &>
   operator+=(const _T &Partial) {
     combine(Partial);
@@ -190,7 +234,7 @@ class reducer<T, BinaryOperation,
 
   template <typename _T = T>
   enable_if_t<std::is_same<_T, T>::value &&
-                  std::is_same<BinaryOperation, std::multiplies<T>>::value,
+                  IsReduMultiplies<T, BinaryOperation>::value,
               reducer &>
   operator*=(const _T &Partial) {
     combine(Partial);
@@ -199,7 +243,7 @@ class reducer<T, BinaryOperation,
 
   template <typename _T = T>
   enable_if_t<std::is_same<_T, T>::value &&
-                  std::is_same<BinaryOperation, intel::bit_or<T>>::value,
+                  IsReduBitOR<T, BinaryOperation>::value,
               reducer &>
   operator|=(const _T &Partial) {
     combine(Partial);
@@ -208,7 +252,7 @@ class reducer<T, BinaryOperation,
 
   template <typename _T = T>
   enable_if_t<std::is_same<_T, T>::value &&
-                  std::is_same<BinaryOperation, intel::bit_xor<T>>::value,
+                  IsReduBitXOR<T, BinaryOperation>::value,
               reducer &>
   operator^=(const _T &Partial) {
     combine(Partial);
@@ -217,7 +261,7 @@ class reducer<T, BinaryOperation,
 
   template <typename _T = T>
   enable_if_t<std::is_same<_T, T>::value &&
-                  std::is_same<BinaryOperation, intel::bit_and<T>>::value,
+                  IsReduBitAND<T, BinaryOperation>::value,
               reducer &>
   operator&=(const _T &Partial) {
     combine(Partial);
@@ -228,7 +272,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::plus<T>>::value>
+              IsReduPlus<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_add(MValue);
@@ -238,7 +282,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::bit_or<T>>::value>
+              IsReduBitOR<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_or(MValue);
@@ -248,7 +292,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::bit_xor<T>>::value>
+              IsReduBitXOR<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_xor(MValue);
@@ -258,7 +302,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::bit_and<T>>::value>
+              IsReduBitAND<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_and(MValue);
@@ -268,7 +312,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::minimum<T>>::value>
+              IsReduMinimum<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_min(MValue);
@@ -278,7 +322,7 @@ class reducer<T, BinaryOperation,
   template <typename _T = T, class _BinaryOperation = BinaryOperation>
   enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
               (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
-              std::is_same<_BinaryOperation, intel::maximum<T>>::value>
+              IsReduMaximum<T, _BinaryOperation>::value>
   atomic_combine(_T *ReduVarPtr) const {
     atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
         .fetch_max(MValue);
@@ -349,17 +393,20 @@ class reduction_impl {
       typename _T = T, class _BinaryOperation = BinaryOperation,
       enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
   reduction_impl(accessor_type &Acc, const T &Identity)
-      : MAcc(Acc), MIdentity(Identity) {
+      : MAcc(Acc), MIdentity(getIdentity()) {
     assert(Acc.get_count() == 1 &&
            "Only scalar/1-element reductions are supported now.");
-    // For operations with known identity value the operator == is defined.
-    // It is sort of dilemma here: from one point of view - user may set
-    // such identity that would be enough for his data, i.e. identity=100 for
-    // min operation if user knows all data elements are less than 100.
-    // From another point of view - it is the source of unexpected errors,
-    // when the input data changes.
-    // Let's be strict for now and emit an error if identity is not proper.
-    assert(Identity == getIdentity() && "Unexpected Identity parameter value.");
+    // For now the implementation ignores the identity value given by user
+    // when the implementation knows the identity.
+    // The SPEC could prohibit passing identity parameter to operations with
+    // known identity, but that could have some bad consequences too.
+    // For example, at some moment the implementation may NOT know the identity
+    // for COMPLEX-PLUS reduction. User may create a program that would pass
+    // COMPLEX value (0,0) as identity for PLUS reduction. At some later moment
+    // when the implementation starts handling COMPLEX-PLUS as known operation
+    // the existing user's program remains compilable and working correctly.
+    // I.e. with this constructor here, adding more reduction operations to the
+    // list of known operations does not break the existing programs.
   }
 
   /// Constructs reduction_impl when the identity value is unknown.
@@ -577,7 +624,8 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
 
       typename Reduction::binary_operation BOp;
       size_t GID = NDIt.get_global_linear_id();
-      auto Val = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      typename Reduction::result_type Val =
+          (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
       Reducer.MValue = intel::reduce(NDIt.get_group(), Val, BOp);
       if (NDIt.get_local_linear_id() == 0)
         Reducer.atomic_combine(Out.get_pointer().get());
@@ -654,7 +702,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
       size_t LID = NDIt.get_local_linear_id();
       size_t GID = NDIt.get_global_linear_id();
       // Copy the element to local memory to prepare it for tree-reduction.
-      auto ReduIdentity = Reducer.getIdentity();
+      typename Reduction::result_type ReduIdentity = Reducer.getIdentity();
       LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
       LocalReds[WGSize] = ReduIdentity;
       NDIt.barrier();
@@ -715,10 +763,13 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
       // Compute the partial sum/reduction for the work-group.
       typename Reduction::binary_operation BOp;
       size_t WGID = NDIt.get_group_linear_id();
-      auto V = intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
-      if (NDIt.get_local_linear_id() == 0)
-        Out.get_pointer().get()[WGID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      typename Reduction::result_type PSum =
+          intel::reduce(NDIt.get_group(), Reducer.MValue, BOp);
+      if (NDIt.get_local_linear_id() == 0) {
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[WGID] = PSum;
+      }
     });
   } else {
     // Inefficient case: non-uniform work-group require additional checks.
@@ -734,11 +785,14 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
       typename Reduction::binary_operation BOp;
       size_t GID = NDIt.get_global_linear_id();
       size_t WGID = NDIt.get_group_linear_id();
-      auto V = (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
-      V = intel::reduce(NDIt.get_group(), V, BOp);
-      if (NDIt.get_local_linear_id() == 0)
-        Out.get_pointer().get()[WGID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      typename Reduction::result_type PSum =
+          (GID < NWorkItems) ? Reducer.MValue : Reducer.getIdentity();
+      PSum = intel::reduce(NDIt.get_group(), PSum, BOp);
+      if (NDIt.get_local_linear_id() == 0) {
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[WGID] = PSum;
+      }
     });
   }
 }
@@ -777,7 +831,7 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
   auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, CGH);
 
   auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
-  auto ReduIdentity = Redu.getIdentity();
+  typename Reduction::result_type ReduIdentity = Redu.getIdentity();
   if (IsEfficientCase) {
     // Efficient case: work-groups are uniform and WGSize is is power of two.
     CGH.parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
@@ -800,10 +854,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
       }
 
       // Compute the partial sum/reduction for the work-group.
-      if (LID == 0)
-        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                              : LocalReds[0];
+      if (LID == 0) {
+        typename Reduction::result_type PSum = LocalReds[0];
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] = PSum;
+      }
     });
   } else {
     // Inefficient case: work-groups are non uniform or WGSize is not power
@@ -844,9 +900,11 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
       // Compute the partial sum/reduction for the work-group.
       if (LID == 0) {
         size_t GrID = NDIt.get_group_linear_id();
-        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-        Out.get_pointer().get()[GrID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+        typename Reduction::result_type PSum =
+            BOp(LocalReds[0], LocalReds[WGSize]);
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[GrID] = PSum;
       }
     });
   }
@@ -884,10 +942,13 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
       typename Reduction::binary_operation BOp;
       size_t WGID = NDIt.get_group_linear_id();
       size_t GID = NDIt.get_global_linear_id();
-      auto V = intel::reduce(NDIt.get_group(), In[GID], BOp);
-      if (NDIt.get_local_linear_id() == 0)
-        Out.get_pointer().get()[WGID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      typename Reduction::result_type PSum =
+          intel::reduce(NDIt.get_group(), In[GID], BOp);
+      if (NDIt.get_local_linear_id() == 0) {
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[WGID] = PSum;
+      }
     });
   } else {
     // Inefficient case: non-uniform work-groups require additional checks.
@@ -899,12 +960,14 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
       size_t WGID = NDIt.get_group_linear_id();
       size_t GID = NDIt.get_global_linear_id();
       typename Reduction::reducer_type Reducer;
-      auto V =
+      typename Reduction::result_type PSum =
           (GID < NWorkItems) ? In[GID] : Reduction::reducer_type::getIdentity();
-      V = intel::reduce(NDIt.get_group(), V, BOp);
-      if (NDIt.get_local_linear_id() == 0)
-        Out.get_pointer().get()[WGID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+      PSum = intel::reduce(NDIt.get_group(), PSum, BOp);
+      if (NDIt.get_local_linear_id() == 0) {
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[WGID] = PSum;
+      }
     });
   }
 }
@@ -969,10 +1032,12 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
       }
 
       // Compute the partial sum/reduction for the work-group.
-      if (LID == 0)
-        Out.get_pointer().get()[NDIt.get_group_linear_id()] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
-                              : LocalReds[0];
+      if (LID == 0) {
+        typename Reduction::result_type PSum = LocalReds[0];
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[NDIt.get_group_linear_id()] = PSum;
+      }
     });
   } else {
     // Inefficient case: work-groups are not fully loaded
@@ -1010,9 +1075,11 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
       // Compute the partial sum/reduction for the work-group.
       if (LID == 0) {
         size_t GrID = NDIt.get_group_linear_id();
-        auto V = BOp(LocalReds[0], LocalReds[WGSize]);
-        Out.get_pointer().get()[GrID] =
-            IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
+        typename Reduction::result_type PSum =
+            BOp(LocalReds[0], LocalReds[WGSize]);
+        if (IsUpdateOfUserAcc)
+          PSum = BOp(*(Out.get_pointer()), PSum);
+        Out.get_pointer().get()[GrID] = PSum;
       }
     });
   }
diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp
index 9e4fb306a95b9..7f8e8e9726e59 100644
--- a/sycl/test/reduction/reduction_ctor.cpp
+++ b/sycl/test/reduction/reduction_ctor.cpp
@@ -1,48 +1,15 @@
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
-//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
 
 // This performs basic checks such as reduction creation, getIdentity() method,
 // and the combine() method of the aux class 'reducer'.
 
+#include "reduction_utils.hpp"
 #include <CL/sycl.hpp>
 #include <cassert>
 
 using namespace cl::sycl;
 
-template <typename T, class BinaryOperation, int N>
-struct init_data_t {
-  void initInputData(T IdentityVal,
-                     buffer<T, 1> &InBuf,
-                     T &ExpectedReduValue) {
-    ExpectedReduValue = IdentityVal;
-    BinaryOperation Op;
-    auto In = InBuf.template get_access<access::mode::write>();
-    for (int I = 0; I < N; ++I) {
-      In[I] = ((I + 1) % 5) + 1;
-      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
-    }
-  }
-};
-
-template <typename T, int N>
-struct init_data_t<T, std::multiplies<T>, N> {
-  void initInputData(T IdentityVal, buffer<T, 1> &InBuf, T &ExpectedReduValue) {
-    ExpectedReduValue = IdentityVal;
-    std::multiplies<T> Op;
-    auto In = InBuf.template get_access<access::mode::write>();
-    for (int I = 0; I < N; ++I) {
-      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
-      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
-    }
-  }
-};
 
 template <typename T, typename Reduction>
 void test_reducer(Reduction &Redu, T A, T B) {
@@ -157,15 +124,13 @@ int main() {
   testBoth<int, intel::bit_or<int>>(0, 1, 8);
   testBoth<int, intel::bit_xor<int>>(0, 7, 3);
   testBoth<int, intel::bit_and<int>>(~0, 7, 3);
-  testBoth<int, intel::minimum<int>>(std::numeric_limits<int>::max(), 7, 3);
-  testBoth<int, intel::maximum<int>>(std::numeric_limits<int>::min(), 7, 3);
+  testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
+  testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
 
   testBoth<float, intel::plus<float>>(0, 1, 7);
   testBoth<float, std::multiplies<float>>(1, 1, 7);
-  testBoth<float, intel::minimum<float>>(
-      std::numeric_limits<float>::max(), 7, 3);
-  testBoth<float, intel::maximum<float>>(
-      std::numeric_limits<float>::min(), 7, 3);
+  testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
+  testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
 
   testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
   testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
index 4160efcb5afba..61b6e10481546 100644
--- a/sycl/test/reduction/reduction_nd_s0_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -79,17 +79,17 @@ int main() {
   test<int, 0, intel::bit_or<int>>(0, 8, 256);
   test<int, 0, intel::bit_xor<int>>(0, 8, 256);
   test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
index b16eee70656cd..bdcaad8433911 100644
--- a/sycl/test/reduction/reduction_nd_s0_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -81,17 +81,17 @@ int main() {
   test<int, 0, intel::bit_or<int>>(0, 8, 256);
   test<int, 0, intel::bit_xor<int>>(0, 8, 256);
   test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
index eef8413a76a32..981d4e77f5e33 100644
--- a/sycl/test/reduction/reduction_nd_s1_dw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -80,17 +80,17 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 8, 256);
   test<int, 1, intel::bit_xor<int>>(0, 8, 256);
   test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
index 6c69c4d04f12d..28ec3989273ae 100644
--- a/sycl/test/reduction/reduction_nd_s1_rw.cpp
+++ b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -82,17 +82,17 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 8, 256);
   test<int, 1, intel::bit_xor<int>>(0, 8, 256);
   test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp
index 3b2ff95fb219b..e972105bbab50 100644
--- a/sycl/test/reduction/reduction_placeholder.cpp
+++ b/sycl/test/reduction/reduction_placeholder.cpp
@@ -72,8 +72,8 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 5, 5 * 7);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 4, 128);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
index d9392fc2784a4..1363312c812b7 100644
--- a/sycl/test/reduction/reduction_transparent.cpp
+++ b/sycl/test/reduction/reduction_transparent.cpp
@@ -1,8 +1,17 @@
+// UNSUPPORTED: cuda
+// Reductions use work-group builtins not yet supported by CUDA.
+
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
+// for 'cl::sycl::half' type.
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out
+
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // where func is a transparent functor.
@@ -14,10 +23,14 @@
 using namespace cl::sycl;
 
 template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+class SomeIdClass;
+template <typename T, int Dim, class BinaryOperation>
+class SomeNoIdClass;
 
+// Checks reductions initialized with transparent functor and explicitly set
+// identity value.
 template <typename T, int Dim, class BinaryOperation>
-void test(T Identity, size_t WGSize, size_t NWItems) {
+void testId(T Identity, size_t WGSize, size_t NWItems) {
   buffer<T, 1> InBuf(NWItems);
   buffer<T, 1> OutBuf(1);
 
@@ -32,13 +45,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
-    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
-        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+    CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
+        NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
@@ -54,12 +66,62 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
   }
 }
 
+// Checks reductions initialized with transparent functor and identity
+// value not explicitly specified. The parameter 'Identity' is passed here
+// only to pre-initialize input data correctly.
+template <typename T, int Dim, class BinaryOperation>
+void testNoId(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
+        NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  testId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
+  testNoId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
+}
+
 int main() {
 #if __cplusplus >= 201402L
-  test<double, 0, intel::maximum<>>(std::numeric_limits<double>::min(), 7, 7 * 5);
+  test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
   test<signed char, 0, intel::plus<>>(0, 7, 49);
   test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
-#endif
+#ifndef SKIP_FOR_HALF
+  test<half, 1, intel::plus<>>(0, 4, 8);
+  test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
+#endif // SKIP_FOR_HALF
+#endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_usm.cpp b/sycl/test/reduction/reduction_usm.cpp
index bb99f372818d9..592a36904a8e8 100644
--- a/sycl/test/reduction/reduction_usm.cpp
+++ b/sycl/test/reduction/reduction_usm.cpp
@@ -113,10 +113,8 @@ int main() {
   testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  testUSM<float, 1, intel::minimum<float>>(
-      (std::numeric_limits<float>::max)(), 5, 5 * 7);
-  testUSM<float, 0, intel::maximum<float>>(
-      (std::numeric_limits<float>::min)(), 4, 128);
+  testUSM<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  testUSM<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp
index c7a1699298f02..d09bae11e6e6e 100644
--- a/sycl/test/reduction/reduction_utils.hpp
+++ b/sycl/test/reduction/reduction_utils.hpp
@@ -52,3 +52,17 @@ struct CustomVecPlus {
     return CV(A.X + B.X, A.Y + B.Y);
   }
 };
+
+template <typename T>
+T getMinimumFPValue() {
+  return std::numeric_limits<T>::has_infinity
+             ? -std::numeric_limits<T>::infinity()
+             : std::numeric_limits<T>::lowest();
+}
+
+template <typename T>
+T getMaximumFPValue() {
+  return std::numeric_limits<T>::has_infinity
+             ? std::numeric_limits<T>::infinity()
+             : (std::numeric_limits<T>::max)();
+}