diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index 27b3d3d6c137f..83beb030b9a49 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -26,6 +26,7 @@
 #include <CL/sycl/intel/builtins.hpp>
 #include <CL/sycl/intel/function_pointer.hpp>
 #include <CL/sycl/intel/group_algorithm.hpp>
+#include <CL/sycl/intel/reduction.hpp>
 #include <CL/sycl/intel/sub_group.hpp>
 #include <CL/sycl/item.hpp>
 #include <CL/sycl/kernel.hpp>
diff --git a/sycl/include/CL/sycl/intel/functional.hpp b/sycl/include/CL/sycl/intel/functional.hpp
index 842adbf96b3be..f54e3777d3375 100644
--- a/sycl/include/CL/sycl/intel/functional.hpp
+++ b/sycl/include/CL/sycl/intel/functional.hpp
@@ -53,6 +53,9 @@ template <> struct maximum<void> {
 #endif
 
 template <typename T = void> using plus = std::plus<T>;
+template <typename T = void> using bit_or = std::bit_or<T>;
+template <typename T = void> using bit_xor = std::bit_xor<T>;
+template <typename T = void> using bit_and = std::bit_and<T>;
 
 } // namespace intel
 
diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/intel/group_algorithm.hpp
index 86304709c7e23..5a11d23812cd7 100644
--- a/sycl/include/CL/sycl/intel/group_algorithm.hpp
+++ b/sycl/include/CL/sycl/intel/group_algorithm.hpp
@@ -108,8 +108,8 @@ template <typename T, typename V> struct identity<T, intel::maximum<V>> {
 template <typename Group, typename Ptr, class Function>
 Function for_each(Group g, Ptr first, Ptr last, Function f) {
 #ifdef __SYCL_DEVICE_ONLY__
-  ptrdiff_t offset = detail::get_local_linear_id(g);
-  ptrdiff_t stride = detail::get_local_linear_range(g);
+  ptrdiff_t offset = sycl::detail::get_local_linear_id(g);
+  ptrdiff_t stride = sycl::detail::get_local_linear_range(g);
   for (Ptr p = first + offset; p < last; p += stride) {
     f(*p);
   }
@@ -137,11 +137,11 @@ using EnableIfIsPointer =
     cl::sycl::detail::enable_if_t<cl::sycl::detail::is_pointer<Ptr>::value, T>;
 
 template <typename Group> bool all_of(Group g, bool pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::spirv::GroupAll<Group>(pred);
+  return sycl::detail::spirv::GroupAll<Group>(pred);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -150,7 +150,7 @@ template <typename Group> bool all_of(Group g, bool pred) {
 
 template <typename Group, typename T, class Predicate>
 bool all_of(Group g, T x, Predicate pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   return all_of(g, pred(x));
@@ -159,14 +159,14 @@ bool all_of(Group g, T x, Predicate pred) {
 template <typename Group, typename Ptr, class Predicate>
 EnableIfIsPointer<Ptr, bool> all_of(Group g, Ptr first, Ptr last,
                                     Predicate pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   bool partial = true;
-  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
-    partial &= pred(x);
-  });
+  sycl::detail::for_each(
+      g, first, last,
+      [&](const typename Ptr::element_type &x) { partial &= pred(x); });
   return all_of(g, partial);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
@@ -175,11 +175,11 @@ EnableIfIsPointer<Ptr, bool> all_of(Group g, Ptr first, Ptr last,
 }
 
 template <typename Group> bool any_of(Group g, bool pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::spirv::GroupAny<Group>(pred);
+  return sycl::detail::spirv::GroupAny<Group>(pred);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -188,7 +188,7 @@ template <typename Group> bool any_of(Group g, bool pred) {
 
 template <typename Group, typename T, class Predicate>
 bool any_of(Group g, T x, Predicate pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   return any_of(g, pred(x));
@@ -198,13 +198,13 @@ template <typename Group, typename Ptr, class Predicate>
 EnableIfIsPointer<Ptr, bool> any_of(Group g, Ptr first, Ptr last,
                                     Predicate pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   bool partial = false;
-  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
-    partial |= pred(x);
-  });
+  sycl::detail::for_each(
+      g, first, last,
+      [&](const typename Ptr::element_type &x) { partial |= pred(x); });
   return any_of(g, partial);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
@@ -213,11 +213,11 @@ EnableIfIsPointer<Ptr, bool> any_of(Group g, Ptr first, Ptr last,
 }
 
 template <typename Group> bool none_of(Group g, bool pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::spirv::GroupAll<Group>(!pred);
+  return sycl::detail::spirv::GroupAll<Group>(!pred);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -226,7 +226,7 @@ template <typename Group> bool none_of(Group g, bool pred) {
 
 template <typename Group, typename T, class Predicate>
 bool none_of(Group g, T x, Predicate pred) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   return none_of(g, pred(x));
@@ -236,7 +236,7 @@ template <typename Group, typename Ptr, class Predicate>
 EnableIfIsPointer<Ptr, bool> none_of(Group g, Ptr first, Ptr last,
                                      Predicate pred) {
 #ifdef __SYCL_DEVICE_ONLY__
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   return !any_of(g, first, last, pred);
@@ -249,11 +249,11 @@ EnableIfIsPointer<Ptr, bool> none_of(Group g, Ptr first, Ptr last,
 template <typename Group, typename T>
 EnableIfIsScalarArithmetic<T> broadcast(Group g, T x,
                                         typename Group::id_type local_id) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::spirv::GroupBroadcast<Group>(x, local_id);
+  return sycl::detail::spirv::GroupBroadcast<Group>(x, local_id);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -263,7 +263,7 @@ EnableIfIsScalarArithmetic<T> broadcast(Group g, T x,
 template <typename Group, typename T>
 EnableIfIsVectorArithmetic<T> broadcast(Group g, T x,
                                         typename Group::id_type local_id) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
@@ -281,12 +281,13 @@ EnableIfIsVectorArithmetic<T> broadcast(Group g, T x,
 template <typename Group, typename T>
 EnableIfIsScalarArithmetic<T>
 broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
   return broadcast(
-      g, x, detail::linear_id_to_id(g.get_local_range(), linear_local_id));
+      g, x,
+      sycl::detail::linear_id_to_id(g.get_local_range(), linear_local_id));
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -296,7 +297,7 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
 template <typename Group, typename T>
 EnableIfIsVectorArithmetic<T>
 broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
@@ -313,7 +314,7 @@ broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
 
 template <typename Group, typename T>
 EnableIfIsScalarArithmetic<T> broadcast(Group g, T x) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
@@ -326,7 +327,7 @@ EnableIfIsScalarArithmetic<T> broadcast(Group g, T x) {
 
 template <typename Group, typename T>
 EnableIfIsVectorArithmetic<T> broadcast(Group g, T x) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
@@ -343,7 +344,7 @@ EnableIfIsVectorArithmetic<T> broadcast(Group g, T x) {
 
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -353,9 +354,9 @@ EnableIfIsScalarArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
            std::is_same<decltype(binary_op(x, x)), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::calc<T, __spv::GroupOperation::Reduce,
-                      detail::spirv::group_scope<Group>::value>(
-      typename detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
+                            sycl::detail::spirv::group_scope<Group>::value>(
+      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -364,7 +365,7 @@ EnableIfIsScalarArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
 
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -384,7 +385,7 @@ EnableIfIsVectorArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
 template <typename Group, typename V, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> reduce(Group g, V x, T init,
                                      BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -404,7 +405,7 @@ EnableIfIsScalarArithmetic<T> reduce(Group g, V x, T init,
 template <typename Group, typename V, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> reduce(Group g, V x, T init,
                                      BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -429,7 +430,7 @@ EnableIfIsVectorArithmetic<T> reduce(Group g, V x, T init,
 template <typename Group, typename Ptr, class BinaryOperation>
 EnableIfIsPointer<Ptr, typename Ptr::element_type>
 reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -441,10 +442,12 @@ reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
   typename Ptr::element_type partial =
-      detail::identity<typename Ptr::element_type, BinaryOperation>::value;
-  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
-    partial = binary_op(partial, x);
-  });
+      sycl::detail::identity<typename Ptr::element_type,
+                             BinaryOperation>::value;
+  sycl::detail::for_each(g, first, last,
+                         [&](const typename Ptr::element_type &x) {
+                           partial = binary_op(partial, x);
+                         });
   return reduce(g, partial, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
@@ -455,7 +458,7 @@ reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
 template <typename Group, typename Ptr, typename T, class BinaryOperation>
 EnableIfIsPointer<Ptr, T> reduce(Group g, Ptr first, Ptr last, T init,
                                  BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -465,11 +468,12 @@ EnableIfIsPointer<Ptr, T> reduce(Group g, Ptr first, Ptr last, T init,
            std::is_same<decltype(binary_op(init, *first)), float>::value),
       "Result type of binary_op must match reduction accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  T partial =
-      detail::identity<typename Ptr::element_type, BinaryOperation>::value;
-  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
-    partial = binary_op(partial, x);
-  });
+  T partial = sycl::detail::identity<typename Ptr::element_type,
+                                     BinaryOperation>::value;
+  sycl::detail::for_each(g, first, last,
+                         [&](const typename Ptr::element_type &x) {
+                           partial = binary_op(partial, x);
+                         });
   return reduce(g, partial, init, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
@@ -480,7 +484,7 @@ EnableIfIsPointer<Ptr, T> reduce(Group g, Ptr first, Ptr last, T init,
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -489,9 +493,9 @@ EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, T x,
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                      detail::spirv::group_scope<Group>::value>(
-      typename detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
+                            sycl::detail::spirv::group_scope<Group>::value>(
+      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -501,7 +505,7 @@ EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, T x,
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -521,7 +525,7 @@ EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, T x,
 template <typename Group, typename V, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, V x, T init,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -541,7 +545,7 @@ EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, V x, T init,
 template <typename Group, typename V, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, V x, T init,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -551,7 +555,7 @@ EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, V x, T init,
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
   typename Group::linear_id_type local_linear_id =
-      detail::get_local_linear_id(g);
+      sycl::detail::get_local_linear_id(g);
   if (local_linear_id == 0) {
     x = binary_op(init, x);
   }
@@ -571,7 +575,7 @@ template <typename Group, typename InPtr, typename OutPtr, typename T,
 EnableIfIsPointer<InPtr, OutPtr>
 exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init,
                BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -581,8 +585,8 @@ exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init,
            std::is_same<decltype(binary_op(*first, *first)), float>::value),
       "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  ptrdiff_t offset = detail::get_local_linear_id(g);
-  ptrdiff_t stride = detail::get_local_linear_range(g);
+  ptrdiff_t offset = sycl::detail::get_local_linear_id(g);
+  ptrdiff_t stride = sycl::detail::get_local_linear_range(g);
   ptrdiff_t N = last - first;
   auto roundup = [=](const ptrdiff_t &v,
                      const ptrdiff_t &divisor) -> ptrdiff_t {
@@ -620,16 +624,16 @@ EnableIfIsPointer<InPtr, OutPtr> exclusive_scan(Group g, InPtr first,
           (std::is_same<typename OutPtr::element_type, half>::value &&
            std::is_same<decltype(binary_op(*first, *first)), float>::value),
       "Result type of binary_op must match scan accumulation type.");
-  return exclusive_scan(
-      g, first, last, result,
-      detail::identity<typename OutPtr::element_type, BinaryOperation>::value,
-      binary_op);
+  return exclusive_scan(g, first, last, result,
+                        sycl::detail::identity<typename OutPtr::element_type,
+                                               BinaryOperation>::value,
+                        binary_op);
 }
 
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsVectorArithmetic<T> inclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -649,7 +653,7 @@ EnableIfIsVectorArithmetic<T> inclusive_scan(Group g, T x,
 template <typename Group, typename T, class BinaryOperation>
 EnableIfIsScalarArithmetic<T> inclusive_scan(Group g, T x,
                                              BinaryOperation binary_op) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -658,9 +662,9 @@ EnableIfIsScalarArithmetic<T> inclusive_scan(Group g, T x,
                      std::is_same<decltype(binary_op(x, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  return detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                      detail::spirv::group_scope<Group>::value>(
-      typename detail::GroupOpTag<T>::type(), x, binary_op);
+  return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
+                            sycl::detail::spirv::group_scope<Group>::value>(
+      typename sycl::detail::GroupOpTag<T>::type(), x, binary_op);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
                       PI_INVALID_DEVICE);
@@ -670,7 +674,7 @@ EnableIfIsScalarArithmetic<T> inclusive_scan(Group g, T x,
 template <typename Group, typename V, class BinaryOperation, typename T>
 EnableIfIsScalarArithmetic<T>
 inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -679,7 +683,7 @@ inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
                      std::is_same<decltype(binary_op(init, x)), float>::value),
                 "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  if (detail::get_local_linear_id(g) == 0) {
+  if (sycl::detail::get_local_linear_id(g) == 0) {
     x = binary_op(init, x);
   }
   return inclusive_scan(g, x, binary_op);
@@ -692,7 +696,7 @@ inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
 template <typename Group, typename V, class BinaryOperation, typename T>
 EnableIfIsVectorArithmetic<T>
 inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -713,7 +717,7 @@ template <typename Group, typename InPtr, typename OutPtr,
 EnableIfIsPointer<InPtr, OutPtr>
 inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
                BinaryOperation binary_op, T init) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
   // FIXME: Do not special-case for half precision
@@ -723,8 +727,8 @@ inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
            std::is_same<decltype(binary_op(init, *first)), float>::value),
       "Result type of binary_op must match scan accumulation type.");
 #ifdef __SYCL_DEVICE_ONLY__
-  ptrdiff_t offset = detail::get_local_linear_id(g);
-  ptrdiff_t stride = detail::get_local_linear_range(g);
+  ptrdiff_t offset = sycl::detail::get_local_linear_id(g);
+  ptrdiff_t stride = sycl::detail::get_local_linear_range(g);
   ptrdiff_t N = last - first;
   auto roundup = [=](const ptrdiff_t &v,
                      const ptrdiff_t &divisor) -> ptrdiff_t {
@@ -762,17 +766,18 @@ EnableIfIsPointer<InPtr, OutPtr> inclusive_scan(Group g, InPtr first,
           (std::is_same<typename OutPtr::element_type, half>::value &&
            std::is_same<decltype(binary_op(*first, *first)), float>::value),
       "Result type of binary_op must match scan accumulation type.");
-  return inclusive_scan(
-      g, first, last, result, binary_op,
-      detail::identity<typename OutPtr::element_type, BinaryOperation>::value);
+  return inclusive_scan(g, first, last, result, binary_op,
+                        sycl::detail::identity<typename OutPtr::element_type,
+                                               BinaryOperation>::value);
 }
 
 template <typename Group> bool leader(Group g) {
-  static_assert(detail::is_generic_group<Group>::value,
+  static_assert(sycl::detail::is_generic_group<Group>::value,
                 "Group algorithms only support the sycl::group and "
                 "intel::sub_group class.");
 #ifdef __SYCL_DEVICE_ONLY__
-  typename Group::linear_id_type linear_id = detail::get_local_linear_id(g);
+  typename Group::linear_id_type linear_id =
+      sycl::detail::get_local_linear_id(g);
   return (linear_id == 0);
 #else
   throw runtime_error("Group algorithms are not supported on host device.",
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
new file mode 100644
index 0000000000000..2b0fb264bc94f
--- /dev/null
+++ b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -0,0 +1,388 @@
+//==---------------- reduction.hpp - SYCL reduction ------------*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#pragma once
+
+#include <CL/sycl/accessor.hpp>
+
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace intel {
+
+namespace detail {
+
+using cl::sycl::detail::bool_constant;
+using cl::sycl::detail::enable_if_t;
+using cl::sycl::detail::is_geninteger16bit;
+using cl::sycl::detail::is_geninteger32bit;
+using cl::sycl::detail::is_geninteger64bit;
+using cl::sycl::detail::is_geninteger8bit;
+using cl::sycl::detail::remove_AS;
+
+// Identity = 0
+template <typename T, class BinaryOperation>
+using IsZeroIdentityOp = bool_constant<
+    ((is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
+      is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+     (std::is_same<BinaryOperation, intel::plus<T>>::value ||
+      std::is_same<BinaryOperation, intel::bit_or<T>>::value ||
+      std::is_same<BinaryOperation, intel::bit_xor<T>>::value)) ||
+    ((std::is_same<T, float>::value || std::is_same<T, double>::value) &&
+     std::is_same<BinaryOperation, intel::plus<T>>::value)>;
+
+// Identity = 1
+template <typename T, class BinaryOperation>
+using IsOneIdentityOp = bool_constant<
+    (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
+     is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
+     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
+    std::is_same<BinaryOperation, std::multiplies<T>>::value>;
+
+// Identity = ~0
+template <typename T, class BinaryOperation>
+using IsOnesIdentityOp = bool_constant<
+    (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
+     is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+    std::is_same<BinaryOperation, intel::bit_and<T>>::value>;
+
+// Identity = <max possible value>
+template <typename T, class BinaryOperation>
+using IsMinimumIdentityOp = bool_constant<
+    (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
+     is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
+     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
+    std::is_same<BinaryOperation, intel::minimum<T>>::value>;
+
+// Identity = <min possible value>
+template <typename T, class BinaryOperation>
+using IsMaximumIdentityOp = bool_constant<
+    (is_geninteger8bit<T>::value || is_geninteger16bit<T>::value ||
+     is_geninteger32bit<T>::value || is_geninteger64bit<T>::value ||
+     std::is_same<T, float>::value || std::is_same<T, double>::value) &&
+    std::is_same<BinaryOperation, intel::maximum<T>>::value>;
+
+template <typename T, class BinaryOperation>
+using IsKnownIdentityOp =
+    bool_constant<IsZeroIdentityOp<T, BinaryOperation>::value ||
+                  IsOneIdentityOp<T, BinaryOperation>::value ||
+                  IsOnesIdentityOp<T, BinaryOperation>::value ||
+                  IsMinimumIdentityOp<T, BinaryOperation>::value ||
+                  IsMaximumIdentityOp<T, BinaryOperation>::value>;
+
+/// Class that is used to represent objects that are passed to user's lambda
+/// functions and representing users' reduction variable.
+/// The generic version of the class represents those reductions of those
+/// types and operations for which the identity value is not known.
+template <typename T, class BinaryOperation, typename Subst = void>
+class reducer {
+public:
+  reducer(const T &Identity) : MValue(Identity), MIdentity(Identity) {}
+  void combine(const T &Partial) {
+    BinaryOperation BOp;
+    MValue = BOp(MValue, Partial);
+  }
+
+  T getIdentity() const { return MIdentity; }
+
+  T MValue;
+
+private:
+  const T MIdentity;
+};
+
+/// Specialization of the generic class 'reducer'. It is used for reductions
+/// of those types and operations for which the identity value is known.
+///
+/// It allows to reduce the size of the 'reducer' object by not holding
+/// the identity field inside it and allows to add a default constructor.
+///
+/// Also, for many types with known identity the operation 'atomic_combine()'
+/// is implemented here, which allows to use more efficient version of kernels
+/// using those operations, which are based on functionality provided by
+/// sycl::atomic class.
+///
+/// For example, it is known that 0 is identity for intel::plus operations
+/// accepting native scalar types to which scalar 0 is convertible.
+/// Also, for int32/64 types the atomic_combine() is lowered to
+/// sycl::atomic::fetch_add().
+//
+// TODO: More types and ops can be added to here later.
+template <typename T, class BinaryOperation>
+class reducer<T, BinaryOperation,
+              enable_if_t<IsKnownIdentityOp<T, BinaryOperation>::value>> {
+public:
+  reducer() : MValue(getIdentity()) {}
+  reducer(const T &Identity) : MValue(getIdentity()) {}
+
+  void combine(const T &Partial) {
+    BinaryOperation BOp;
+    MValue = BOp(MValue, Partial);
+  }
+
+  /// Returns zero as identity for ADD, OR, XOR operations.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  static enable_if_t<IsZeroIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return 0;
+  }
+
+  /// Returns one as identify for MULTIPLY operations.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  static enable_if_t<IsOneIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return 1;
+  }
+
+  /// Returns bit image consisting of all ones as identity for AND operations.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  static enable_if_t<IsOnesIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return ~static_cast<_T>(0);
+  }
+
+  /// Returns maximal possible value as identity for MIN operations.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  static enable_if_t<IsMinimumIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return (std::numeric_limits<_T>::max)();
+  }
+
+  /// Returns minimal possible value as identity for MAX operations.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  static enable_if_t<IsMaximumIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return (std::numeric_limits<_T>::min)();
+  }
+
+  template <typename _T = T>
+  enable_if_t<std::is_same<_T, T>::value &&
+                  std::is_same<BinaryOperation, intel::plus<T>>::value,
+              reducer &>
+  operator+=(const _T &Partial) {
+    combine(Partial);
+    return *this;
+  }
+
+  template <typename _T = T>
+  enable_if_t<std::is_same<_T, T>::value &&
+                  std::is_same<BinaryOperation, std::multiplies<T>>::value,
+              reducer &>
+  operator*=(const _T &Partial) {
+    combine(Partial);
+    return *this;
+  }
+
+  template <typename _T = T>
+  enable_if_t<std::is_same<_T, T>::value &&
+                  std::is_same<BinaryOperation, intel::bit_or<T>>::value,
+              reducer &>
+  operator|=(const _T &Partial) {
+    combine(Partial);
+    return *this;
+  }
+
+  template <typename _T = T>
+  enable_if_t<std::is_same<_T, T>::value &&
+                  std::is_same<BinaryOperation, intel::bit_xor<T>>::value,
+              reducer &>
+  operator^=(const _T &Partial) {
+    combine(Partial);
+    return *this;
+  }
+
+  template <typename _T = T>
+  enable_if_t<std::is_same<_T, T>::value &&
+                  std::is_same<BinaryOperation, intel::bit_and<T>>::value,
+              reducer &>
+  operator&=(const _T &Partial) {
+    combine(Partial);
+    return *this;
+  }
+
+  /// Atomic ADD operation: *ReduVarPtr += MValue;
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::plus<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_add(MValue);
+  }
+
+  /// Atomic BITWISE OR operation: *ReduVarPtr |= MValue;
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::bit_or<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_or(MValue);
+  }
+
+  /// Atomic BITWISE XOR operation: *ReduVarPtr ^= MValue;
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::bit_xor<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_xor(MValue);
+  }
+
+  /// Atomic BITWISE AND operation: *ReduVarPtr &= MValue;
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::bit_and<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_and(MValue);
+  }
+
+  /// Atomic MIN operation: *ReduVarPtr = intel::minimum(*ReduVarPtr, MValue);
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::minimum<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_min(MValue);
+  }
+
+  /// Atomic MAX operation: *ReduVarPtr = intel::maximum(*ReduVarPtr, MValue);
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<std::is_same<typename remove_AS<_T>::type, T>::value &&
+              (is_geninteger32bit<T>::value || is_geninteger64bit<T>::value) &&
+              std::is_same<_BinaryOperation, intel::maximum<T>>::value>
+  atomic_combine(_T *ReduVarPtr) const {
+    atomic<T, access::address_space::global_space>(global_ptr<T>(ReduVarPtr))
+        .fetch_max(MValue);
+  }
+
+  T MValue;
+};
+
+/// This class encapsulates the reduction variable/accessor,
+/// the reduction operator and an optional operator identity.
+template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
+          access::placeholder IsPlaceholder>
+class reduction_impl {
+public:
+  using reducer_type = reducer<T, BinaryOperation>;
+  using result_type = T;
+  using binary_operation = BinaryOperation;
+  using accessor_type =
+      accessor<T, Dims, AccMode, access::target::global_buffer, IsPlaceholder>;
+  static constexpr access::mode accessor_mode = AccMode;
+  static constexpr int accessor_dim = Dims;
+  static constexpr int buffer_dim = (Dims == 0) ? 1 : Dims;
+
+  // Only scalar (i.e. 0-dim and 1-dim with 1 element) reductions supported now.
+  // TODO: suport (Dims > 1) and placeholder accessors/reductions.
+  // TODO: support true 1-Dimensional accessors/reductions (get_count() > 1).
+  // (get_count() == 1) is checked in the constructor of reduction_impl.
+  static_assert(Dims <= 1 && IsPlaceholder == access::placeholder::false_t,
+                "Multi-dimensional and placeholder reductions"
+                " are not supported yet.");
+
+  /// Returns the statically known identity value.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value,
+              _T> constexpr getIdentity() {
+    return reducer_type::getIdentity();
+  }
+
+  /// Returns the identity value given by user.
+  template <typename _T = T, class _BinaryOperation = BinaryOperation>
+  enable_if_t<!IsKnownIdentityOp<_T, _BinaryOperation>::value, _T>
+  getIdentity() {
+    return MIdentity;
+  }
+
+  /// Constructs reduction_impl when the identity value is statically known.
+  template <
+      typename _T = T, class _BinaryOperation = BinaryOperation,
+      enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
+  reduction_impl(accessor_type &Acc) : MAcc(Acc), MIdentity(getIdentity()) {
+    assert(Acc.get_count() == 1 &&
+           "Only scalar/1-element reductions are supported now.");
+  }
+
+  /// Constructs reduction_impl when the identity value is statically known,
+  /// and user still passed the identity value.
+  template <
+      typename _T = T, class _BinaryOperation = BinaryOperation,
+      enable_if_t<IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
+  reduction_impl(accessor_type &Acc, const T &Identity)
+      : MAcc(Acc), MIdentity(Identity) {
+    assert(Acc.get_count() == 1 &&
+           "Only scalar/1-element reductions are supported now.");
+    // For operations with known identity value the operator == is defined.
+    // It is sort of dilemma here: from one point of view - user may set
+    // such identity that would be enough for his data, i.e. identity=100 for
+    // min operation if user knows all data elements are less than 100.
+    // From another point of view - it is the source of unexpected errors,
+    // when the input data changes.
+    // Let's be strict for now and emit an error if identity is not proper.
+    assert(Identity == getIdentity() && "Unexpected Identity parameter value.");
+  }
+
+  /// Constructs reduction_impl when the identity value is unknown.
+  template <
+      typename _T = T, class _BinaryOperation = BinaryOperation,
+      enable_if_t<!IsKnownIdentityOp<_T, _BinaryOperation>::value> * = nullptr>
+  reduction_impl(accessor_type &Acc, const T &Identity)
+      : MAcc(Acc), MIdentity(Identity) {
+    assert(Acc.get_count() == 1 &&
+           "Only scalar/1-element reductions are supported now.");
+  }
+
+  /// User's accessor to where the reduction must be written.
+  accessor_type MAcc;
+
+private:
+  /// Identity of the BinaryOperation.
+  /// The result of BinaryOperation(X, MIdentity) is equal to X for any X.
+  const T MIdentity;
+};
+
+} // namespace detail
+
+/// Creates and returns an object implementing the reduction functionality.
+/// Accepts 3 arguments: the accessor to buffer to where the computed reduction
+/// must be stored \param Acc, identity value \param Identity, and the
+/// binary operation that must be used in the reduction \param Combiner.
+template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
+          access::placeholder IsPH>
+detail::reduction_impl<T, BinaryOperation, Dims, AccMode, IsPH>
+reduction(accessor<T, Dims, AccMode, access::target::global_buffer, IsPH> &Acc,
+          const T &Identity, BinaryOperation Combiner) {
+  // The Combiner argument was needed only to define the BinaryOperation param.
+  return detail::reduction_impl<T, BinaryOperation, Dims, AccMode, IsPH>(
+      Acc, Identity);
+}
+
+/// Creates and returns an object implementing the reduction functionality.
+/// Accepts 2 arguments: the accessor to buffer to where the computed reduction
+/// must be stored \param Acc and the binary operation that must be used
+/// in the reduction \param Combiner.
+/// The identity value is not passed to this version as it is statically known.
+template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
+          access::placeholder IsPH>
+detail::enable_if_t<
+    detail::IsKnownIdentityOp<T, BinaryOperation>::value,
+    detail::reduction_impl<T, BinaryOperation, Dims, AccMode, IsPH>>
+reduction(accessor<T, Dims, AccMode, access::target::global_buffer, IsPH> &Acc,
+          BinaryOperation Combiner) {
+  // The Combiner argument was needed only to define the BinaryOperation param.
+  return detail::reduction_impl<T, BinaryOperation, Dims, AccMode, IsPH>(Acc);
+}
+
+} // namespace intel
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp
index 91d1b748e3a41..648fefe66c85e 100644
--- a/sycl/include/CL/sycl/intel/sub_group.hpp
+++ b/sycl/include/CL/sycl/intel/sub_group.hpp
@@ -36,7 +36,7 @@ namespace sub_group {
 
 #define __SYCL_SG_GENERATE_BODY_1ARG(name, SPIRVOperation)                     \
   template <typename T> T name(T x, id<1> local_id) {                          \
-    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    using OCLT = sycl::detail::ConvertToOpenCLType_t<T>;                       \
     return __spirv_##SPIRVOperation(OCLT(x), local_id.get(0));                 \
   }
 
@@ -47,7 +47,7 @@ __SYCL_SG_GENERATE_BODY_1ARG(shuffle_xor, SubgroupShuffleXorINTEL)
 
 #define __SYCL_SG_GENERATE_BODY_2ARG(name, SPIRVOperation)                     \
   template <typename T> T name(T A, T B, uint32_t Delta) {                     \
-    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    using OCLT = sycl::detail::ConvertToOpenCLType_t<T>;                       \
     return __spirv_##SPIRVOperation(OCLT(A), OCLT(B), Delta);                  \
   }
 
@@ -77,7 +77,7 @@ template <typename To, typename From> To bit_cast(const From &from) {
   return __builtin_bit_cast(To, from);
 #else
   To to;
-  detail::memcpy(&to, &from, sizeof(To));
+  sycl::detail::memcpy(&to, &from, sizeof(To));
   return to;
 #endif // __has_builtin(__builtin_bit_cast)
 #endif // __cpp_lib_bit_cast
@@ -86,7 +86,8 @@ template <typename To, typename From> To bit_cast(const From &from) {
 template <typename T, access::address_space Space>
 T load(const multi_ptr<T, Space> src) {
   using BlockT = SelectBlockT<T>;
-  using PtrT = detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
+  using PtrT =
+      sycl::detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
 
   BlockT Ret =
       __spirv_SubgroupBlockReadINTEL<BlockT>(reinterpret_cast<PtrT>(src.get()));
@@ -97,8 +98,9 @@ T load(const multi_ptr<T, Space> src) {
 template <int N, typename T, access::address_space Space>
 vec<T, N> load(const multi_ptr<T, Space> src) {
   using BlockT = SelectBlockT<T>;
-  using VecT = detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
-  using PtrT = detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
+  using VecT = sycl::detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
+  using PtrT =
+      sycl::detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
 
   VecT Ret =
       __spirv_SubgroupBlockReadINTEL<VecT>(reinterpret_cast<PtrT>(src.get()));
@@ -109,7 +111,7 @@ vec<T, N> load(const multi_ptr<T, Space> src) {
 template <typename T, access::address_space Space>
 void store(multi_ptr<T, Space> dst, const T &x) {
   using BlockT = SelectBlockT<T>;
-  using PtrT = detail::ConvertToOpenCLType_t<multi_ptr<BlockT, Space>>;
+  using PtrT = sycl::detail::ConvertToOpenCLType_t<multi_ptr<BlockT, Space>>;
 
   __spirv_SubgroupBlockWriteINTEL(reinterpret_cast<PtrT>(dst.get()),
                                   bit_cast<BlockT>(x));
@@ -118,8 +120,9 @@ void store(multi_ptr<T, Space> dst, const T &x) {
 template <int N, typename T, access::address_space Space>
 void store(multi_ptr<T, Space> dst, const vec<T, N> &x) {
   using BlockT = SelectBlockT<T>;
-  using VecT = detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
-  using PtrT = detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
+  using VecT = sycl::detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
+  using PtrT =
+      sycl::detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
 
   __spirv_SubgroupBlockWriteINTEL(reinterpret_cast<PtrT>(dst.get()),
                                   bit_cast<VecT>(x));
@@ -171,22 +174,23 @@ struct sub_group {
 
   template <typename T>
   using EnableIfIsScalarArithmetic =
-      detail::enable_if_t<detail::is_scalar_arithmetic<T>::value, T>;
+      sycl::detail::enable_if_t<sycl::detail::is_scalar_arithmetic<T>::value,
+                                T>;
 
   /* --- collectives --- */
 
   template <typename T>
   __SYCL_EXPORT_DEPRECATED("Use sycl::intel::broadcast instead.")
   EnableIfIsScalarArithmetic<T> broadcast(T x, id<1> local_id) const {
-    return detail::spirv::GroupBroadcast<sub_group>(x, local_id);
+    return sycl::detail::spirv::GroupBroadcast<sub_group>(x, local_id);
   }
 
   template <typename T, class BinaryOperation>
   __SYCL_EXPORT_DEPRECATED("Use sycl::intel::reduce instead.")
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::Reduce,
-                        __spv::Scope::Subgroup>(
-        typename detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<T, __spv::GroupOperation::Reduce,
+                              __spv::Scope::Subgroup>(
+        typename sycl::detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -198,9 +202,9 @@ struct sub_group {
   template <typename T, class BinaryOperation>
   __SYCL_EXPORT_DEPRECATED("Use sycl::intel::exclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::ExclusiveScan,
-                        __spv::Scope::Subgroup>(
-        typename detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<T, __spv::GroupOperation::ExclusiveScan,
+                              __spv::Scope::Subgroup>(
+        typename sycl::detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -220,9 +224,9 @@ struct sub_group {
   template <typename T, class BinaryOperation>
   __SYCL_EXPORT_DEPRECATED("Use sycl::intel::inclusive_scan instead.")
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::InclusiveScan,
-                        __spv::Scope::Subgroup>(
-        typename detail::GroupOpTag<T>::type(), x, op);
+    return sycl::detail::calc<T, __spv::GroupOperation::InclusiveScan,
+                              __spv::Scope::Subgroup>(
+        typename sycl::detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -239,90 +243,94 @@ struct sub_group {
   /* indices in [0 , sub_group size) */
 
   template <typename T> T shuffle(T x, id<1> local_id) const {
-    return detail::sub_group::shuffle(x, local_id);
+    return sycl::detail::sub_group::shuffle(x, local_id);
   }
 
   template <typename T> T shuffle_down(T x, uint32_t delta) const {
-    return detail::sub_group::shuffle_down(x, x, delta);
+    return sycl::detail::sub_group::shuffle_down(x, x, delta);
   }
 
   template <typename T> T shuffle_up(T x, uint32_t delta) const {
-    return detail::sub_group::shuffle_up(x, x, delta);
+    return sycl::detail::sub_group::shuffle_up(x, x, delta);
   }
 
   template <typename T> T shuffle_xor(T x, id<1> value) const {
-    return detail::sub_group::shuffle_xor(x, value);
+    return sycl::detail::sub_group::shuffle_xor(x, value);
   }
 
   /* --- two-input shuffles --- */
   /* indices in [0 , 2 * sub_group size) */
 
   template <typename T> T shuffle(T x, T y, id<1> local_id) const {
-    return detail::sub_group::shuffle_down(x, y,
-                                           (local_id - get_local_id()).get(0));
+    return sycl::detail::sub_group::shuffle_down(
+        x, y, (local_id - get_local_id()).get(0));
   }
 
   template <typename T>
   T shuffle_down(T current, T next, uint32_t delta) const {
-    return detail::sub_group::shuffle_down(current, next, delta);
+    return sycl::detail::sub_group::shuffle_down(current, next, delta);
   }
 
   template <typename T>
   T shuffle_up(T previous, T current, uint32_t delta) const {
-    return detail::sub_group::shuffle_up(previous, current, delta);
+    return sycl::detail::sub_group::shuffle_up(previous, current, delta);
   }
 
   /* --- sub_group load/stores --- */
   /* these can map to SIMD or block read/write hardware where available */
 
   template <typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value, T>
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value, T>
   load(const multi_ptr<T, Space> src) const {
-    return detail::sub_group::load(src);
+    return sycl::detail::sub_group::load(src);
   }
 
   template <int N, typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N != 1,
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value &&
+          N != 1,
       vec<T, N>>
   load(const multi_ptr<T, Space> src) const {
-    return detail::sub_group::load<N, T>(src);
+    return sycl::detail::sub_group::load<N, T>(src);
   }
 
   template <int N, typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N == 1,
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value &&
+          N == 1,
       vec<T, 1>>
   load(const multi_ptr<T, Space> src) const {
-    return detail::sub_group::load(src);
+    return sycl::detail::sub_group::load(src);
   }
 
   template <typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value>
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value>
   store(multi_ptr<T, Space> dst, const T &x) const {
-    detail::sub_group::store(dst, x);
+    sycl::detail::sub_group::store(dst, x);
   }
 
   template <int N, typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N == 1>
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value &&
+      N == 1>
   store(multi_ptr<T, Space> dst, const vec<T, 1> &x) const {
     store<T, Space>(dst, x);
   }
 
   template <int N, typename T, access::address_space Space>
-  detail::enable_if_t<
-      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N != 1>
+  sycl::detail::enable_if_t<
+      sycl::detail::sub_group::AcceptableForLoadStore<T, Space>::value &&
+      N != 1>
   store(multi_ptr<T, Space> dst, const vec<T, N> &x) const {
-    detail::sub_group::store(dst, x);
+    sycl::detail::sub_group::store(dst, x);
   }
 
   /* --- synchronization functions --- */
   void barrier(access::fence_space accessSpace =
                    access::fence_space::global_and_local) const {
-    uint32_t flags = detail::getSPIRVMemorySemanticsMask(accessSpace);
+    uint32_t flags = sycl::detail::getSPIRVMemorySemanticsMask(accessSpace);
     __spirv_ControlBarrier(__spv::Scope::Subgroup, __spv::Scope::Subgroup,
                            flags);
   }
diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp
new file mode 100644
index 0000000000000..9e4fb306a95b9
--- /dev/null
+++ b/sycl/test/reduction/reduction_ctor.cpp
@@ -0,0 +1,175 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This performs basic checks such as reduction creation, getIdentity() method,
+// and the combine() method of the aux class 'reducer'.
+
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, class BinaryOperation, int N>
+struct init_data_t {
+  void initInputData(T IdentityVal,
+                     buffer<T, 1> &InBuf,
+                     T &ExpectedReduValue) {
+    ExpectedReduValue = IdentityVal;
+    BinaryOperation Op;
+    auto In = InBuf.template get_access<access::mode::write>();
+    for (int I = 0; I < N; ++I) {
+      In[I] = ((I + 1) % 5) + 1;
+      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
+    }
+  }
+};
+
+template <typename T, int N>
+struct init_data_t<T, std::multiplies<T>, N> {
+  void initInputData(T IdentityVal, buffer<T, 1> &InBuf, T &ExpectedReduValue) {
+    ExpectedReduValue = IdentityVal;
+    std::multiplies<T> Op;
+    auto In = InBuf.template get_access<access::mode::write>();
+    for (int I = 0; I < N; ++I) {
+      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
+      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
+    }
+  }
+};
+
+template <typename T, typename Reduction>
+void test_reducer(Reduction &Redu, T A, T B) {
+  typename Reduction::reducer_type Reducer;
+  Reducer.combine(A);
+  Reducer.combine(B);
+
+  typename Reduction::binary_operation BOp;
+  T ExpectedValue = BOp(A, B);
+  assert(ExpectedValue == Reducer.MValue &&
+         "Wrong result of binary operation.");
+}
+
+template <typename T, typename Reduction>
+void test_reducer(Reduction &Redu, T Identity, T A, T B) {
+  typename Reduction::reducer_type Reducer(Identity);
+  Reducer.combine(A);
+  Reducer.combine(B);
+
+  typename Reduction::binary_operation BOp;
+  T ExpectedValue = BOp(A, B);
+  assert(ExpectedValue == Reducer.MValue &&
+         "Wrong result of binary operation.");
+}
+
+template <typename T, int Dim, class BinaryOperation>
+class Known;
+template <typename T, int Dim, class BinaryOperation>
+class Unknown;
+
+template <typename T>
+struct Point {
+  Point() : X(0), Y(0) {}
+  Point(T X, T Y) : X(X), Y(Y) {}
+  Point(T V) : X(V), Y(V) {}
+  bool operator==(const Point &P) const {
+    return P.X == X && P.Y == Y;
+  }
+  T X;
+  T Y;
+};
+
+template <typename T>
+bool operator==(const Point<T> &A, const Point<T> &B) {
+  return A.X == B.X && A.Y == B.Y;
+}
+
+template <class T>
+struct PointPlus {
+  using P = Point<T>;
+  P operator()(const P &A, const P &B) const {
+    return P(A.X + B.X, A.Y + B.Y);
+  }
+};
+
+template <typename T, int Dim, class BinaryOperation>
+void testKnown(T Identity, T A, T B) {
+
+  BinaryOperation BOp;
+  buffer<T, 1> ReduBuf(1);
+
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    // Reduction needs a global_buffer accessor as a parameter.
+    // This accessor is not really used in this test.
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        ReduAcc(ReduBuf, CGH);
+    auto Redu = intel::reduction(ReduAcc, BOp);
+    assert(Redu.getIdentity() == Identity &&
+           "Failed getIdentity() check().");
+    test_reducer(Redu, A, B);
+    test_reducer(Redu, Identity, A, B);
+
+    // Command group must have at least one task in it. Use an empty one.
+    CGH.single_task<Known<T, Dim, BinaryOperation>>([=]() {});
+  });
+}
+
+template <typename T, int Dim, class BinaryOperation>
+void testUnknown(T Identity, T A, T B) {
+
+  BinaryOperation BOp;
+  buffer<T, 1> ReduBuf(1);
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    // Reduction needs a global_buffer accessor as a parameter.
+    // This accessor is not really used in this test.
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        ReduAcc(ReduBuf, CGH);
+    auto Redu = intel::reduction(ReduAcc, Identity, BOp);
+    assert(Redu.getIdentity() == Identity &&
+           "Failed getIdentity() check().");
+    test_reducer(Redu, Identity, A, B);
+
+    // Command group must have at least one task in it. Use an empty one.
+    CGH.single_task<Unknown<T, Dim, BinaryOperation>>([=]() {});
+  });
+}
+
+template <typename T, class BinaryOperation>
+void testBoth(T Identity, T A, T B) {
+  testKnown<T, 0, BinaryOperation>(Identity, A, B);
+  testKnown<T, 1, BinaryOperation>(Identity, A, B);
+  testUnknown<T, 0, BinaryOperation>(Identity, A, B);
+  testUnknown<T, 1, BinaryOperation>(Identity, A, B);
+}
+
+int main() {
+  // testKnown does not pass identity to reduction ctor.
+  testBoth<int, intel::plus<int>>(0, 1, 7);
+  testBoth<int, std::multiplies<int>>(1, 1, 7);
+  testBoth<int, intel::bit_or<int>>(0, 1, 8);
+  testBoth<int, intel::bit_xor<int>>(0, 7, 3);
+  testBoth<int, intel::bit_and<int>>(~0, 7, 3);
+  testBoth<int, intel::minimum<int>>(std::numeric_limits<int>::max(), 7, 3);
+  testBoth<int, intel::maximum<int>>(std::numeric_limits<int>::min(), 7, 3);
+
+  testBoth<float, intel::plus<float>>(0, 1, 7);
+  testBoth<float, std::multiplies<float>>(1, 1, 7);
+  testBoth<float, intel::minimum<float>>(
+      std::numeric_limits<float>::max(), 7, 3);
+  testBoth<float, intel::maximum<float>>(
+      std::numeric_limits<float>::min(), 7, 3);
+
+  testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
+  testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
+
+  std::cout << "Test passed\n";
+  return 0;
+}