diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 38bcba20d8a27..a773694f69db7 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -191,7 +191,7 @@ extern SYCL_EXTERNAL bool __spirv_GroupAny(__spv::Scope Execution,
 template <typename dataT>
 extern SYCL_EXTERNAL dataT __spirv_GroupBroadcast(__spv::Scope Execution,
                                                   dataT Value,
-                                                  uint32_t LocalId) noexcept;
+                                                  size_t LocalId) noexcept;
 
 template <typename dataT>
 extern SYCL_EXTERNAL dataT
diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index 4c6e6e8790052..3455e0ab07219 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -23,22 +23,22 @@
 #include <CL/sycl/image.hpp>
 #include <CL/sycl/intel/builtins.hpp>
 #include <CL/sycl/intel/function_pointer.hpp>
+#include <CL/sycl/intel/group_algorithm.hpp>
 #include <CL/sycl/intel/sub_group.hpp>
 #include <CL/sycl/item.hpp>
 #include <CL/sycl/kernel.hpp>
 #include <CL/sycl/multi_ptr.hpp>
 #include <CL/sycl/nd_item.hpp>
 #include <CL/sycl/nd_range.hpp>
+#include <CL/sycl/ordered_queue.hpp>
 #include <CL/sycl/pipes.hpp>
 #include <CL/sycl/platform.hpp>
 #include <CL/sycl/pointers.hpp>
 #include <CL/sycl/program.hpp>
 #include <CL/sycl/queue.hpp>
-#include <CL/sycl/ordered_queue.hpp>
 #include <CL/sycl/range.hpp>
 #include <CL/sycl/sampler.hpp>
 #include <CL/sycl/stream.hpp>
 #include <CL/sycl/types.hpp>
 #include <CL/sycl/usm.hpp>
 #include <CL/sycl/version.hpp>
-
diff --git a/sycl/include/CL/sycl/detail/spirv.hpp b/sycl/include/CL/sycl/detail/spirv.hpp
new file mode 100644
index 0000000000000..8c5f80f3674b2
--- /dev/null
+++ b/sycl/include/CL/sycl/detail/spirv.hpp
@@ -0,0 +1,55 @@
+//===-- spirv.hpp - Helpers to generate SPIR-V instructions ----*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/__spirv/spirv_ops.hpp>
+#include <CL/__spirv/spirv_types.hpp>
+#include <CL/__spirv/spirv_vars.hpp>
+#include <CL/sycl/detail/generic_type_traits.hpp>
+#include <CL/sycl/detail/type_traits.hpp>
+
+#ifdef __SYCL_DEVICE_ONLY__
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+namespace spirv {
+// Broadcast with scalar local index (overload enabled for integral IdT only).
+// Both operands are converted to their OpenCL types before the SPIR-V call.
+template <__spv::Scope S, typename T, typename IdT>
+detail::enable_if_t<std::is_integral<IdT>::value, T>
+GroupBroadcast(T x, IdT local_id) {
+  using ValueOCL = detail::ConvertToOpenCLType_t<T>;
+  using IndexOCL = detail::ConvertToOpenCLType_t<IdT>;
+  const IndexOCL Index = detail::convertDataToType<IdT, IndexOCL>(local_id);
+  const ValueOCL Value = detail::convertDataToType<T, ValueOCL>(x);
+  return __spirv_GroupBroadcast(S, Value, Index);
+}
+// Broadcast with vector local index. One-dimensional ids go to the scalar
+// overload; otherwise the id is packed into a vec in reversed dimension order.
+template <__spv::Scope S, typename T, int Dimensions>
+T GroupBroadcast(T x, id<Dimensions> local_id) {
+  if (Dimensions == 1) {
+    return GroupBroadcast<S>(x, local_id[0]);
+  }
+  using IdT = vec<size_t, Dimensions>;
+  using OCLT = detail::ConvertToOpenCLType_t<T>;
+  using OCLIdT = detail::ConvertToOpenCLType_t<IdT>;
+  IdT vec_id;
+  for (int i = 0; i < Dimensions; ++i) {
+    vec_id[i] = local_id[Dimensions - i - 1]; // last SYCL dimension first
+  }
+  OCLT ocl_x = detail::convertDataToType<T, OCLT>(x);
+  OCLIdT ocl_id = detail::convertDataToType<IdT, OCLIdT>(vec_id);
+  return __spirv_GroupBroadcast(S, ocl_x, ocl_id);
+}
+
+} // namespace spirv
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
+#endif //  __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp
index 4050c2742b66d..6b0bbd255f103 100644
--- a/sycl/include/CL/sycl/detail/type_traits.hpp
+++ b/sycl/include/CL/sycl/detail/type_traits.hpp
@@ -195,6 +195,14 @@ template <typename T>
 struct is_arithmetic
     : bool_constant<is_integral<T>::value || is_floating_point<T>::value> {};
 
+template <typename T>
+struct is_scalar_arithmetic // is_arithmetic<T> holds and T is not a vec
+    : bool_constant<!is_vec<T>::value && is_arithmetic<T>::value> {};
+// True when T is a vec and is_arithmetic<T> also holds.
+template <typename T>
+struct is_vector_arithmetic
+    : bool_constant<is_vec<T>::value && is_arithmetic<T>::value> {};
+
 // is_pointer
 template <typename T> struct is_pointer_impl : std::false_type {};
 
diff --git a/sycl/include/CL/sycl/group.hpp b/sycl/include/CL/sycl/group.hpp
index 7fc777a53e644..c871ec95bd0bf 100644
--- a/sycl/include/CL/sycl/group.hpp
+++ b/sycl/include/CL/sycl/group.hpp
@@ -81,25 +81,32 @@ template <typename T, int Dimensions = 1> class private_memory {
 #endif // #ifdef __SYCL_DEVICE_ONLY__
 };
 
-template <int dimensions = 1> class group {
+template <int Dimensions = 1> class group {
 public:
+#ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
+  using id_type = id<Dimensions>;
+  using range_type = range<Dimensions>;
+  using linear_id_type = size_t;
+  static constexpr int dimensions = Dimensions;
+#endif // __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
+
   group() = delete;
 
-  id<dimensions> get_id() const { return index; }
+  id<Dimensions> get_id() const { return index; }
 
   size_t get_id(int dimension) const { return index[dimension]; }
 
-  range<dimensions> get_global_range() const { return globalRange; }
+  range<Dimensions> get_global_range() const { return globalRange; }
 
   size_t get_global_range(int dimension) const {
     return globalRange[dimension];
   }
 
-  range<dimensions> get_local_range() const { return localRange; }
+  range<Dimensions> get_local_range() const { return localRange; }
 
   size_t get_local_range(int dimension) const { return localRange[dimension]; }
 
-  range<dimensions> get_group_range() const { return groupRange; }
+  range<Dimensions> get_group_range() const { return groupRange; }
 
   size_t get_group_range(int dimension) const {
     return get_group_range()[dimension];
@@ -107,12 +114,12 @@ template <int dimensions = 1> class group {
 
   size_t operator[](int dimension) const { return index[dimension]; }
 
-  template <int dims = dimensions>
+  template <int dims = Dimensions>
   typename std::enable_if<(dims == 1), size_t>::type get_linear_id() const {
     return index[0];
   }
 
-  template <int dims = dimensions>
+  template <int dims = Dimensions>
   typename std::enable_if<(dims == 2), size_t>::type get_linear_id() const {
     return index[0] * groupRange[1] + index[1];
   }
@@ -127,7 +134,7 @@ template <int dimensions = 1> class group {
   //    size_t get_linear_id()const
   //    Get a linearized version of the work-group id. Calculating a linear
   //    work-group id from a multi-dimensional index follows the equation 4.3.
-  template <int dims = dimensions>
+  template <int dims = Dimensions>
   typename std::enable_if<(dims == 3), size_t>::type get_linear_id() const {
     return (index[0] * groupRange[1] * groupRange[2]) +
            (index[1] * groupRange[2]) + index[2];
@@ -139,41 +146,41 @@ template <int dimensions = 1> class group {
     // compilers are expected to optimize when possible
     detail::workGroupBarrier();
 #ifdef __SYCL_DEVICE_ONLY__
-    range<dimensions> GlobalSize{
-        __spirv::initGlobalSize<dimensions, range<dimensions>>()};
-    range<dimensions> LocalSize{
-        __spirv::initWorkgroupSize<dimensions, range<dimensions>>()};
-    id<dimensions> GlobalId{
-        __spirv::initGlobalInvocationId<dimensions, id<dimensions>>()};
-    id<dimensions> LocalId{
-        __spirv::initLocalInvocationId<dimensions, id<dimensions>>()};
+    range<Dimensions> GlobalSize{
+        __spirv::initGlobalSize<Dimensions, range<Dimensions>>()};
+    range<Dimensions> LocalSize{
+        __spirv::initWorkgroupSize<Dimensions, range<Dimensions>>()};
+    id<Dimensions> GlobalId{
+        __spirv::initGlobalInvocationId<Dimensions, id<Dimensions>>()};
+    id<Dimensions> LocalId{
+        __spirv::initLocalInvocationId<Dimensions, id<Dimensions>>()};
 
     // no 'iterate' in the device code variant, because
     // (1) this code is already invoked by each work item as a part of the
     //     enclosing parallel_for_work_group kernel
     // (2) the range this pfwi iterates over matches work group size exactly
-    item<dimensions, false> GlobalItem =
-        detail::Builder::createItem<dimensions, false>(GlobalSize, GlobalId);
-    item<dimensions, false> LocalItem =
-        detail::Builder::createItem<dimensions, false>(LocalSize, LocalId);
-    h_item<dimensions> HItem =
-        detail::Builder::createHItem<dimensions>(GlobalItem, LocalItem);
+    item<Dimensions, false> GlobalItem =
+        detail::Builder::createItem<Dimensions, false>(GlobalSize, GlobalId);
+    item<Dimensions, false> LocalItem =
+        detail::Builder::createItem<Dimensions, false>(LocalSize, LocalId);
+    h_item<Dimensions> HItem =
+        detail::Builder::createHItem<Dimensions>(GlobalItem, LocalItem);
 
     Func(HItem);
 #else
-    id<dimensions> GroupStartID = index * localRange;
+    id<Dimensions> GroupStartID = index * localRange;
 
     // ... host variant needs explicit 'iterate' because it is serial
-    detail::NDLoop<dimensions>::iterate(
-        localRange, [&](const id<dimensions> &LocalID) {
-          item<dimensions, false> GlobalItem =
-              detail::Builder::createItem<dimensions, false>(
+    detail::NDLoop<Dimensions>::iterate(
+        localRange, [&](const id<Dimensions> &LocalID) {
+          item<Dimensions, false> GlobalItem =
+              detail::Builder::createItem<Dimensions, false>(
                   globalRange, GroupStartID + LocalID);
-          item<dimensions, false> LocalItem =
-              detail::Builder::createItem<dimensions, false>(localRange,
+          item<Dimensions, false> LocalItem =
+              detail::Builder::createItem<Dimensions, false>(localRange,
                                                              LocalID);
-          h_item<dimensions> HItem =
-              detail::Builder::createHItem<dimensions>(GlobalItem, LocalItem);
+          h_item<Dimensions> HItem =
+              detail::Builder::createHItem<Dimensions>(GlobalItem, LocalItem);
           Func(HItem);
         });
 #endif // __SYCL_DEVICE_ONLY__
@@ -185,52 +192,52 @@ template <int dimensions = 1> class group {
   }
 
   template <typename WorkItemFunctionT>
-  void parallel_for_work_item(range<dimensions> flexibleRange,
+  void parallel_for_work_item(range<Dimensions> flexibleRange,
                               WorkItemFunctionT Func) const {
     detail::workGroupBarrier();
 #ifdef __SYCL_DEVICE_ONLY__
-    range<dimensions> GlobalSize{
-        __spirv::initGlobalSize<dimensions, range<dimensions>>()};
-    range<dimensions> LocalSize{
-        __spirv::initWorkgroupSize<dimensions, range<dimensions>>()};
-    id<dimensions> GlobalId{
-        __spirv::initGlobalInvocationId<dimensions, id<dimensions>>()};
-    id<dimensions> LocalId{
-        __spirv::initLocalInvocationId<dimensions, id<dimensions>>()};
-
-    item<dimensions, false> GlobalItem =
-        detail::Builder::createItem<dimensions, false>(GlobalSize, GlobalId);
-    item<dimensions, false> LocalItem =
-        detail::Builder::createItem<dimensions, false>(LocalSize, LocalId);
-    h_item<dimensions> HItem = detail::Builder::createHItem<dimensions>(
+    range<Dimensions> GlobalSize{
+        __spirv::initGlobalSize<Dimensions, range<Dimensions>>()};
+    range<Dimensions> LocalSize{
+        __spirv::initWorkgroupSize<Dimensions, range<Dimensions>>()};
+    id<Dimensions> GlobalId{
+        __spirv::initGlobalInvocationId<Dimensions, id<Dimensions>>()};
+    id<Dimensions> LocalId{
+        __spirv::initLocalInvocationId<Dimensions, id<Dimensions>>()};
+
+    item<Dimensions, false> GlobalItem =
+        detail::Builder::createItem<Dimensions, false>(GlobalSize, GlobalId);
+    item<Dimensions, false> LocalItem =
+        detail::Builder::createItem<Dimensions, false>(LocalSize, LocalId);
+    h_item<Dimensions> HItem = detail::Builder::createHItem<Dimensions>(
         GlobalItem, LocalItem, flexibleRange);
 
     // iterate over flexible range with work group size stride; each item
     // performs flexibleRange/LocalSize iterations (if the former is divisible
     // by the latter)
-    detail::NDLoop<dimensions>::iterate(
+    detail::NDLoop<Dimensions>::iterate(
         LocalId, LocalSize, flexibleRange,
-        [&](const id<dimensions> &LogicalLocalID) {
+        [&](const id<Dimensions> &LogicalLocalID) {
           HItem.setLogicalLocalID(LogicalLocalID);
           Func(HItem);
         });
 #else
-    id<dimensions> GroupStartID = index * localRange;
+    id<Dimensions> GroupStartID = index * localRange;
 
-    detail::NDLoop<dimensions>::iterate(
-        localRange, [&](const id<dimensions> &LocalID) {
-          item<dimensions, false> GlobalItem =
-              detail::Builder::createItem<dimensions, false>(
+    detail::NDLoop<Dimensions>::iterate(
+        localRange, [&](const id<Dimensions> &LocalID) {
+          item<Dimensions, false> GlobalItem =
+              detail::Builder::createItem<Dimensions, false>(
                   globalRange, GroupStartID + LocalID);
-          item<dimensions, false> LocalItem =
-              detail::Builder::createItem<dimensions, false>(localRange,
+          item<Dimensions, false> LocalItem =
+              detail::Builder::createItem<Dimensions, false>(localRange,
                                                              LocalID);
-          h_item<dimensions> HItem = detail::Builder::createHItem<dimensions>(
+          h_item<Dimensions> HItem = detail::Builder::createHItem<Dimensions>(
               GlobalItem, LocalItem, flexibleRange);
 
-          detail::NDLoop<dimensions>::iterate(
+          detail::NDLoop<Dimensions>::iterate(
               LocalID, localRange, flexibleRange,
-              [&](const id<dimensions> &LogicalLocalID) {
+              [&](const id<Dimensions> &LogicalLocalID) {
                 HItem.setLogicalLocalID(LogicalLocalID);
                 Func(HItem);
               });
@@ -311,7 +318,7 @@ template <int dimensions = 1> class group {
     waitForHelper(Events...);
   }
 
-  bool operator==(const group<dimensions> &rhs) const {
+  bool operator==(const group<Dimensions> &rhs) const {
     bool Result = (rhs.globalRange == globalRange) &&
                   (rhs.localRange == localRange) && (rhs.index == index);
     __SYCL_ASSERT(rhs.groupRange == groupRange &&
@@ -319,15 +326,15 @@ template <int dimensions = 1> class group {
     return Result;
   }
 
-  bool operator!=(const group<dimensions> &rhs) const {
+  bool operator!=(const group<Dimensions> &rhs) const {
     return !((*this) == rhs);
   }
 
 private:
-  range<dimensions> globalRange;
-  range<dimensions> localRange;
-  range<dimensions> groupRange;
-  id<dimensions> index;
+  range<Dimensions> globalRange;
+  range<Dimensions> localRange;
+  range<Dimensions> groupRange;
+  id<Dimensions> index;
 
   void waitForHelper() const {}
 
@@ -343,8 +350,8 @@ template <int dimensions = 1> class group {
 
 protected:
   friend class detail::Builder;
-  group(const range<dimensions> &G, const range<dimensions> &L,
-        const range<dimensions> GroupRange, const id<dimensions> &I)
+  group(const range<Dimensions> &G, const range<Dimensions> &L,
+        const range<Dimensions> GroupRange, const id<Dimensions> &I)
       : globalRange(G), localRange(L), groupRange(GroupRange), index(I) {
     // Make sure local range divides global without remainder:
     __SYCL_ASSERT(((G % L).size() == 0) &&
diff --git a/sycl/include/CL/sycl/intel/functional.hpp b/sycl/include/CL/sycl/intel/functional.hpp
index 0971d9089205c..018f6d0c2a28b 100644
--- a/sycl/include/CL/sycl/intel/functional.hpp
+++ b/sycl/include/CL/sycl/intel/functional.hpp
@@ -44,7 +44,8 @@ template <> struct maximum<void> {
   template <typename T, typename U>
   auto operator()(T &&lhs, U &&rhs) const ->
       typename std::common_type<T &&, U &&>::type {
-    return std::greater<>()(std::forward<const T>(lhs), std::forward<const U>(rhs))
+    return std::greater<>()(std::forward<const T>(lhs),
+                            std::forward<const U>(rhs))
                ? std::forward<T>(lhs)
                : std::forward<U>(rhs);
   }
@@ -54,5 +55,60 @@ template <> struct maximum<void> {
 template <typename T = void> using plus = std::plus<T>;
 
 } // namespace intel
+
+#ifdef __SYCL_DEVICE_ONLY__
+namespace detail {
+// Tag types used to pick the signed, unsigned or floating-point group op.
+struct GroupOpISigned {};
+struct GroupOpIUnsigned {};
+struct GroupOpFP {};
+// GroupOpTag<T>::type names the tag for T; the primary is left undefined.
+template <typename T, typename = void> struct GroupOpTag;
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sigeninteger<T>::value>> {
+  using type = GroupOpISigned;
+};
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sugeninteger<T>::value>> {
+  using type = GroupOpIUnsigned;
+};
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sgenfloat<T>::value>> {
+  using type = GroupOpFP;
+};
+// Emits calc(GroupTag, x, op), which applies __spirv_Group<SPIRVOperation>.
+#define __SYCL_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)        \
+  template <typename T, __spv::GroupOperation O, __spv::Scope S>               \
+  static T calc(GroupTag, T x, BinaryOperation op) {                           \
+    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    OCLT Arg = x;                                                              \
+    OCLT Ret = __spirv_Group##SPIRVOperation(S, O, Arg);                       \
+    return Ret;                                                                \
+  }
+// One overload per (tag, functor) pair; op itself is never read by calc.
+__SYCL_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum<T>)
+__SYCL_CALC_OVERLOAD(GroupOpISigned, IAdd, intel::plus<T>)
+__SYCL_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, intel::plus<T>)
+__SYCL_CALC_OVERLOAD(GroupOpFP, FAdd, intel::plus<T>)
+
+#undef __SYCL_CALC_OVERLOAD
+// Re-dispatches a BinaryOperation<void> functor as BinaryOperation<T>.
+template <typename T, __spv::GroupOperation O, __spv::Scope S,
+          template <typename> class BinaryOperation>
+static T calc(typename GroupOpTag<T>::type, T x, BinaryOperation<void>) {
+  return calc<T, O, S>(typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
+}
+
+} // namespace detail
+#endif // __SYCL_DEVICE_ONLY__
+
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/include/CL/sycl/intel/group_algorithm.hpp b/sycl/include/CL/sycl/intel/group_algorithm.hpp
new file mode 100644
index 0000000000000..ad8fa67313d91
--- /dev/null
+++ b/sycl/include/CL/sycl/intel/group_algorithm.hpp
@@ -0,0 +1,666 @@
+//==----------- group_algorithm.hpp --- SYCL group algorithm----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/__spirv/spirv_ops.hpp>
+#include <CL/__spirv/spirv_types.hpp>
+#include <CL/__spirv/spirv_vars.hpp>
+#include <CL/sycl/detail/spirv.hpp>
+#include <CL/sycl/detail/type_traits.hpp>
+#include <CL/sycl/group.hpp>
+#include <CL/sycl/intel/functional.hpp>
+
+#ifndef __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+// Total number of work-items in g's local range (product over dimensions).
+template <typename Group> size_t get_local_linear_range(Group g);
+template <> inline size_t get_local_linear_range<group<1>>(group<1> g) {
+  return g.get_local_range().size();
+}
+template <> inline size_t get_local_linear_range<group<2>>(group<2> g) {
+  return g.get_local_range().size();
+}
+template <> inline size_t get_local_linear_range<group<3>>(group<3> g) {
+  return g.get_local_range().size();
+}
+// Maps linear_id back to an id within r; the last dimension varies fastest.
+template <int Dimensions>
+id<Dimensions> linear_id_to_id(range<Dimensions>, size_t linear_id);
+template <> inline id<1> linear_id_to_id(range<1> r, size_t linear_id) {
+  return id<1>(linear_id); // r is unused in one dimension
+}
+template <> inline id<2> linear_id_to_id(range<2> r, size_t linear_id) {
+  id<2> result;
+  result[0] = linear_id / r[1];
+  result[1] = linear_id % r[1];
+  return result;
+}
+template <> inline id<3> linear_id_to_id(range<3> r, size_t linear_id) {
+  id<3> result;
+  result[0] = linear_id / (r[1] * r[2]);
+  result[1] = (linear_id % (r[1] * r[2])) / r[2];
+  result[2] = linear_id % r[2];
+  return result;
+}
+// Trait: true_type for group<Dimensions>, false_type for every other type.
+template <typename T> struct is_group : std::false_type {};
+
+template <int Dimensions>
+struct is_group<group<Dimensions>> : std::true_type {};
+template <typename T, class BinaryOperation> struct identity {};
+template <typename T, typename V> struct identity<T, intel::plus<V>> {
+  static constexpr T value = 0; // x + 0 == x
+};
+template <typename T, typename V> struct identity<T, intel::minimum<V>> {
+  static constexpr T value = std::numeric_limits<T>::has_infinity
+                                 ? std::numeric_limits<T>::infinity() // +inf
+                                 : std::numeric_limits<T>::max(); // no +inf
+};
+template <typename T, typename V> struct identity<T, intel::maximum<V>> {
+  static constexpr T value = std::numeric_limits<T>::has_infinity
+                                 ? -std::numeric_limits<T>::infinity()
+                                 : std::numeric_limits<T>::lowest(); // no -inf
+};
+// Applies f to this work-item's strided share of [first, last); returns f.
+template <typename Group, typename Ptr, class Function>
+Function for_each(Group g, Ptr first, Ptr last, Function f) {
+#ifdef __SYCL_DEVICE_ONLY__
+  const ptrdiff_t step = detail::get_local_linear_range(g);
+  nd_item<Group::dimensions> item =
+      cl::sycl::detail::Builder::getNDItem<Group::dimensions>();
+  const ptrdiff_t start = item.get_local_linear_id();
+  for (Ptr cur = first + start; cur < last; cur += step) {
+    f(*cur);
+  }
+  return f;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+
+} // namespace detail
+
+namespace intel {
+// SFINAE aliases: each one names T only when its trait on T (or Ptr) holds.
+template <typename T>
+using EnableIfIsScalarArithmetic = cl::sycl::detail::enable_if_t<
+    cl::sycl::detail::is_scalar_arithmetic<T>::value, T>;
+
+template <typename T>
+using EnableIfIsVectorArithmetic = cl::sycl::detail::enable_if_t<
+    cl::sycl::detail::is_vector_arithmetic<T>::value, T>;
+
+template <typename Ptr, typename T>
+using EnableIfIsPointer =
+    cl::sycl::detail::enable_if_t<cl::sycl::detail::is_pointer<Ptr>::value, T>;
+// Passes pred to __spirv_GroupAll at Workgroup scope; host builds throw.
+template <typename Group> bool all_of(Group g, bool pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return __spirv_GroupAll(__spv::Scope::Workgroup, pred);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Evaluates pred(x) and forwards the result to the bool overload of all_of.
+template <typename Group, typename T, class Predicate>
+bool all_of(Group g, T x, Predicate pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  return all_of(g, pred(x));
+}
+// ANDs pred over the elements detail::for_each visits, then calls all_of(g, b).
+template <typename Group, typename Ptr, class Predicate>
+EnableIfIsPointer<Ptr, bool> all_of(Group g, Ptr first, Ptr last,
+                                    Predicate pred) {
+#ifdef __SYCL_DEVICE_ONLY__
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  bool partial = true; // neutral value for &=
+  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
+    partial &= pred(x);
+  });
+  return all_of(g, partial);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Passes pred to __spirv_GroupAny at Workgroup scope; host builds throw.
+template <typename Group> bool any_of(Group g, bool pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return __spirv_GroupAny(__spv::Scope::Workgroup, pred);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Evaluates pred(x) and forwards the result to the bool overload of any_of.
+template <typename Group, typename T, class Predicate>
+bool any_of(Group g, T x, Predicate pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  return any_of(g, pred(x));
+}
+// ORs pred over the elements detail::for_each visits, then calls any_of(g, b).
+template <typename Group, typename Ptr, class Predicate>
+EnableIfIsPointer<Ptr, bool> any_of(Group g, Ptr first, Ptr last,
+                                    Predicate pred) {
+#ifdef __SYCL_DEVICE_ONLY__
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  bool partial = false; // neutral value for |=
+  detail::for_each(g, first, last, [&](const typename Ptr::element_type &x) {
+    partial |= pred(x);
+  });
+  return any_of(g, partial);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Passes the negated pred to __spirv_GroupAll at Workgroup scope.
+template <typename Group> bool none_of(Group g, bool pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return __spirv_GroupAll(__spv::Scope::Workgroup, not pred);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Evaluates pred(x) and forwards the result to the bool overload of none_of.
+template <typename Group, typename T, class Predicate>
+bool none_of(Group g, T x, Predicate pred) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  return none_of(g, pred(x));
+}
+// Returns the negation of any_of(g, first, last, pred); host builds throw.
+template <typename Group, typename Ptr, class Predicate>
+EnableIfIsPointer<Ptr, bool> none_of(Group g, Ptr first, Ptr last,
+                                     Predicate pred) {
+#ifdef __SYCL_DEVICE_ONLY__
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  return not any_of(g, first, last, pred);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Forwards x and the id-typed local_id to detail::spirv::GroupBroadcast.
+template <typename Group, typename T>
+EnableIfIsScalarArithmetic<T> broadcast(Group g, T x,
+                                        typename Group::id_type local_id) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return detail::spirv::GroupBroadcast<__spv::Scope::Workgroup>(x, local_id);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Element-wise vec broadcast; loops over get_count() (elements), not bytes.
+template <typename Group, typename T>
+EnableIfIsVectorArithmetic<T> broadcast(Group g, T x,
+                                        typename Group::id_type local_id) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) { // get_size() would be bytes
+    result[s] = broadcast(g, x[s], local_id);
+  }
+  return result;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Converts linear_local_id to an id via linear_id_to_id, then broadcasts.
+template <typename Group, typename T>
+EnableIfIsScalarArithmetic<T>
+broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return broadcast(
+      g, x, detail::linear_id_to_id(g.get_local_range(), linear_local_id));
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Element-wise vec broadcast; loops over get_count() (elements), not bytes.
+template <typename Group, typename T>
+EnableIfIsVectorArithmetic<T>
+broadcast(Group g, T x, typename Group::linear_id_type linear_local_id) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) { // get_size() would be bytes
+    result[s] = broadcast(g, x[s], linear_local_id);
+  }
+  return result;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Broadcast without an explicit source: forwards to broadcast(g, x, 0).
+template <typename Group, typename T>
+EnableIfIsScalarArithmetic<T> broadcast(Group g, T x) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return broadcast(g, x, 0); // NOTE(review): confirm which overload 0 picks
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Element-wise vec broadcast; loops over get_count() (elements), not bytes.
+template <typename Group, typename T>
+EnableIfIsVectorArithmetic<T> broadcast(Group g, T x) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifdef __SYCL_DEVICE_ONLY__
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) { // get_size() would be bytes
+    result[s] = broadcast(g, x[s]);
+  }
+  return result;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Work-group reduction of x via detail::calc (tag chosen by GroupOpTag<T>).
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsScalarArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(x, x)), T>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return detail::calc<T, __spv::GroupOperation::Reduce,
+                      __spv::Scope::Workgroup>(
+      typename detail::GroupOpTag<T>::type(), x, binary_op);
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Element-wise vec reduction; loops over get_count() (elements), not bytes.
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsVectorArithmetic<T> reduce(Group g, T x, BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(x[0], x[0])),
+                   typename T::element_type>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+  T result;
+  for (int s = 0; s < x.get_count(); ++s) { // get_size() would be bytes
+    result[s] = reduce(g, x[s], binary_op);
+  }
+  return result;
+}
+// Combines init with the group reduction of x: binary_op(init, reduce(...)).
+template <typename Group, typename V, typename T, class BinaryOperation>
+EnableIfIsScalarArithmetic<T> reduce(Group g, V x, T init,
+                                     BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(init, x)), T>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+#ifdef __SYCL_DEVICE_ONLY__
+  return binary_op(init, reduce(g, x, binary_op));
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+// Element-wise vec reduction with init; loops over get_count(), not bytes.
+template <typename Group, typename V, typename T, class BinaryOperation>
+EnableIfIsVectorArithmetic<T> reduce(Group g, V x, T init,
+                                     BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(init[0], x[0])),
+                   typename T::element_type>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+#ifdef __SYCL_DEVICE_ONLY__
+  T result = init;
+  for (int s = 0; s < x.get_count(); ++s) { // get_size() would be bytes
+    result[s] = binary_op(init[s], reduce(g, x[s], binary_op));
+  }
+  return result;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+
+// Group reduction over the range [first, last).
+//
+// Each work-item folds the elements that detail::for_each passes to it into a
+// private partial result, starting from the identity of binary_op; the
+// partial results are then combined with the value-based reduce() overload.
+// Device-only: the host path throws runtime_error.
+template <typename Group, typename Ptr, class BinaryOperation>
+EnableIfIsPointer<Ptr, typename Ptr::element_type>
+reduce(Group g, Ptr first, Ptr last, BinaryOperation binary_op) {
+  using ElemT = typename Ptr::element_type;
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(*first, *first)), ElemT>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  ElemT partial = detail::identity<ElemT, BinaryOperation>::value;
+  auto accumulate = [&](const ElemT &x) { partial = binary_op(partial, x); };
+  detail::for_each(g, first, last, accumulate);
+  return reduce(g, partial, binary_op);
+#endif
+}
+
+// Group reduction over the range [first, last), seeded with init.
+//
+// Each work-item folds the elements that detail::for_each passes to it into a
+// private partial of type T, starting from the identity of binary_op for the
+// pointer's element type. The partials and init are then combined by the
+// value-based reduce(g, x, init, binary_op) overload. Device-only: the host
+// path throws runtime_error.
+template <typename Group, typename Ptr, typename T, class BinaryOperation>
+EnableIfIsPointer<Ptr, T> reduce(Group g, Ptr first, Ptr last, T init,
+                                 BinaryOperation binary_op) {
+  using ElemT = typename Ptr::element_type;
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(
+      std::is_same<decltype(binary_op(init, *first)), T>::value,
+      "Result type of binary_op must match reduction accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  T partial = detail::identity<ElemT, BinaryOperation>::value;
+  auto accumulate = [&](const ElemT &x) { partial = binary_op(partial, x); };
+  detail::for_each(g, first, last, accumulate);
+  return reduce(g, partial, init, binary_op);
+#endif
+}
+
+// Exclusive scan of a scalar across the work-group.
+//
+// Forwards to detail::calc with GroupOperation::ExclusiveScan at Workgroup
+// scope; the overload is selected by the tag type detail::GroupOpTag<T>.
+// binary_op(x, x) must return T. Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, T x,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(x, x)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  using OpTag = typename detail::GroupOpTag<T>::type;
+  return detail::calc<T, __spv::GroupOperation::ExclusiveScan,
+                      __spv::Scope::Workgroup>(OpTag(), x, binary_op);
+#endif
+}
+
+// Element-wise exclusive scan of a vector across the group.
+//
+// Component s of the result is the scalar exclusive_scan() of component s of
+// every work-item's x. binary_op applied to two elements must return
+// T::element_type.
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, T x,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(x[0], x[0])),
+                             typename T::element_type>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  T result;
+  // get_count() is the number of elements; get_size() is the size in bytes
+  // and would run the loop past the last element.
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = exclusive_scan(g, x[s], binary_op);
+  }
+  return result;
+}
+
+// Element-wise exclusive scan of a vector, seeded with an initial vector.
+//
+// Component s of the result is the scalar exclusive_scan() of x[s] with
+// init[s] as its initial value. binary_op(init[0], x[0]) must return
+// T::element_type.
+template <typename Group, typename V, typename T, class BinaryOperation>
+EnableIfIsVectorArithmetic<T> exclusive_scan(Group g, V x, T init,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(init[0], x[0])),
+                             typename T::element_type>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  T result;
+  // get_count() is the number of elements; get_size() is the size in bytes
+  // and would run the loop past the last element.
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = exclusive_scan(g, x[s], init[s], binary_op);
+  }
+  return result;
+}
+
+// Exclusive scan of a scalar across the group, seeded with init.
+//
+// The work-item with local linear id 0 folds init into its own input before
+// the plain exclusive scan runs, which carries init into every later
+// work-item's result. That first work-item then returns init itself instead
+// of the value produced by the scan. Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename V, typename T, class BinaryOperation>
+EnableIfIsScalarArithmetic<T> exclusive_scan(Group g, V x, T init,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(init, x)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  const bool is_first_item =
+      detail::Builder::getNDItem<Group::dimensions>().get_local_linear_id() ==
+      0;
+  if (is_first_item) {
+    x = binary_op(init, x);
+  }
+  T scanned = exclusive_scan(g, x, binary_op);
+  return is_first_item ? init : scanned;
+#endif
+}
+
+// Exclusive scan over the range [first, last), seeded with init and written
+// to result.
+//
+// The range is processed in chunks of `stride` elements, where stride is
+// detail::get_local_linear_range(g) (presumably the number of work-items in
+// the group). Within a chunk each work-item loads the element at its local
+// linear id and the value-based exclusive_scan() is applied with the running
+// carry as its initial value. The carry for the next chunk is
+// binary_op(out, x) of the work-item with linear id stride - 1, broadcast to
+// the whole group.
+//
+// Returns result + (last - first). Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename InPtr, typename OutPtr, typename T,
+          class BinaryOperation>
+EnableIfIsPointer<InPtr, OutPtr>
+exclusive_scan(Group g, InPtr first, InPtr last, OutPtr result, T init,
+               BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  // Checked against (init, *first), the same check the other init-taking
+  // overloads apply.
+  static_assert(std::is_same<decltype(binary_op(init, *first)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifdef __SYCL_DEVICE_ONLY__
+  nd_item<Group::dimensions> it =
+      cl::sycl::detail::Builder::getNDItem<Group::dimensions>();
+  ptrdiff_t offset = it.get_local_linear_id();
+  ptrdiff_t stride = detail::get_local_linear_range(g);
+  ptrdiff_t N = last - first;
+  // N rounded up to a whole number of chunks, so every work-item executes the
+  // same number of group calls even when N is not a multiple of stride.
+  const ptrdiff_t padded_N = ((N + stride - 1) / stride) * stride;
+  // Value-initialized: work-items whose index falls past the end of the range
+  // still pass x to the group calls below and must not read an indeterminate
+  // value.
+  typename InPtr::element_type x{};
+  typename OutPtr::element_type carry = init;
+  for (ptrdiff_t chunk = 0; chunk < padded_N; chunk += stride) {
+    ptrdiff_t i = chunk + offset;
+    if (i < N) {
+      x = first[i];
+    }
+    typename OutPtr::element_type out = exclusive_scan(g, x, carry, binary_op);
+    if (i < N) {
+      result[i] = out;
+    }
+    carry = broadcast(g, binary_op(out, x), stride - 1);
+  }
+  return result + N;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+
+// Exclusive scan over the range [first, last) without an explicit initial
+// value: forwards to the init-taking overload, using the identity of
+// binary_op for the output element type as the seed.
+template <typename Group, typename InPtr, typename OutPtr,
+          class BinaryOperation>
+EnableIfIsPointer<InPtr, OutPtr> exclusive_scan(Group g, InPtr first,
+                                                InPtr last, OutPtr result,
+                                                BinaryOperation binary_op) {
+  using OutT = typename OutPtr::element_type;
+  static_assert(std::is_same<decltype(binary_op(*first, *first)), OutT>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  return exclusive_scan(g, first, last, result,
+                        detail::identity<OutT, BinaryOperation>::value,
+                        binary_op);
+}
+
+// Element-wise inclusive scan of a vector across the group.
+//
+// Component s of the result is the scalar inclusive_scan() of component s of
+// every work-item's x. binary_op applied to two elements must return
+// T::element_type.
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsVectorArithmetic<T> inclusive_scan(Group g, T x,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(x[0], x[0])),
+                             typename T::element_type>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  T result;
+  // get_count() is the number of elements; get_size() is the size in bytes
+  // and would run the loop past the last element.
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = inclusive_scan(g, x[s], binary_op);
+  }
+  return result;
+}
+
+// Inclusive scan of a scalar across the work-group.
+//
+// Forwards to detail::calc with GroupOperation::InclusiveScan at Workgroup
+// scope; the overload is selected by the tag type detail::GroupOpTag<T>.
+// binary_op(x, x) must return T. Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename T, class BinaryOperation>
+EnableIfIsScalarArithmetic<T> inclusive_scan(Group g, T x,
+                                             BinaryOperation binary_op) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(x, x)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  using OpTag = typename detail::GroupOpTag<T>::type;
+  return detail::calc<T, __spv::GroupOperation::InclusiveScan,
+                      __spv::Scope::Workgroup>(OpTag(), x, binary_op);
+#endif
+}
+
+// Inclusive scan of a scalar across the group, seeded with init.
+//
+// The work-item with local linear id 0 folds init into its own input; the
+// plain inclusive scan then carries it into every work-item's result.
+// binary_op(init, x) must return T. Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename V, class BinaryOperation, typename T>
+EnableIfIsScalarArithmetic<T>
+inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(init, x)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  const bool is_first_item =
+      detail::Builder::getNDItem<Group::dimensions>().get_local_linear_id() ==
+      0;
+  if (is_first_item) {
+    x = binary_op(init, x);
+  }
+  return inclusive_scan(g, x, binary_op);
+#endif
+}
+
+// Element-wise inclusive scan of a vector, seeded with an initial vector.
+//
+// Component s of the result is the scalar inclusive_scan() of x[s] with
+// init[s] as its initial value. binary_op(init[0], x[0]) must return
+// T::element_type.
+template <typename Group, typename V, class BinaryOperation, typename T>
+EnableIfIsVectorArithmetic<T>
+inclusive_scan(Group g, V x, BinaryOperation binary_op, T init) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  // binary_op is applied to individual elements, so its result is compared
+  // with the element type rather than with the vector type T itself.
+  static_assert(std::is_same<decltype(binary_op(init[0], x[0])),
+                             typename T::element_type>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  T result;
+  // get_count() is the number of elements; get_size() is the size in bytes
+  // and would run the loop past the last element.
+  for (int s = 0; s < x.get_count(); ++s) {
+    result[s] = inclusive_scan(g, x[s], binary_op, init[s]);
+  }
+  return result;
+}
+
+// Inclusive scan over the range [first, last), seeded with init and written
+// to result.
+//
+// The range is processed in chunks of `stride` elements, where stride is
+// detail::get_local_linear_range(g) (presumably the number of work-items in
+// the group). Within a chunk each work-item loads the element at its local
+// linear id and the value-based inclusive_scan() is applied with the running
+// carry as its initial value. The carry for the next chunk is the result of
+// the work-item with linear id stride - 1, broadcast to the whole group.
+//
+// Returns result + (last - first). Device-only: the host path throws
+// runtime_error.
+template <typename Group, typename InPtr, typename OutPtr,
+          class BinaryOperation, typename T>
+EnableIfIsPointer<InPtr, OutPtr>
+inclusive_scan(Group g, InPtr first, InPtr last, OutPtr result,
+               BinaryOperation binary_op, T init) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+  static_assert(std::is_same<decltype(binary_op(init, *first)), T>::value,
+                "Result type of binary_op must match scan accumulation type.");
+#ifdef __SYCL_DEVICE_ONLY__
+  nd_item<Group::dimensions> it =
+      cl::sycl::detail::Builder::getNDItem<Group::dimensions>();
+  ptrdiff_t offset = it.get_local_linear_id();
+  ptrdiff_t stride = detail::get_local_linear_range(g);
+  ptrdiff_t N = last - first;
+  // N rounded up to a whole number of chunks, so every work-item executes the
+  // same number of group calls even when N is not a multiple of stride.
+  const ptrdiff_t padded_N = ((N + stride - 1) / stride) * stride;
+  // Value-initialized: work-items whose index falls past the end of the range
+  // still pass x to the group calls below and must not read an indeterminate
+  // value.
+  typename InPtr::element_type x{};
+  typename OutPtr::element_type carry = init;
+  for (ptrdiff_t chunk = 0; chunk < padded_N; chunk += stride) {
+    ptrdiff_t i = chunk + offset;
+    if (i < N) {
+      x = first[i];
+    }
+    typename OutPtr::element_type out = inclusive_scan(g, x, binary_op, carry);
+    if (i < N) {
+      result[i] = out;
+    }
+    carry = broadcast(g, out, stride - 1);
+  }
+  return result + N;
+#else
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#endif
+}
+
+// Inclusive scan over the range [first, last) without an explicit initial
+// value: forwards to the init-taking overload, using the identity of
+// binary_op for the output element type as the seed.
+template <typename Group, typename InPtr, typename OutPtr,
+          class BinaryOperation>
+EnableIfIsPointer<InPtr, OutPtr> inclusive_scan(Group g, InPtr first,
+                                                InPtr last, OutPtr result,
+                                                BinaryOperation binary_op) {
+  using OutT = typename OutPtr::element_type;
+  static_assert(std::is_same<decltype(binary_op(*first, *first)), OutT>::value,
+                "Result type of binary_op must match scan accumulation type.");
+  return inclusive_scan(g, first, last, result, binary_op,
+                        detail::identity<OutT, BinaryOperation>::value);
+}
+
+// Returns true for the work-item whose local linear id is 0 and false for
+// every other work-item. Device-only: the host path throws runtime_error.
+template <typename Group> bool leader(Group g) {
+  static_assert(detail::is_group<Group>::value,
+                "Group algorithms only support the sycl::group class.");
+#ifndef __SYCL_DEVICE_ONLY__
+  throw runtime_error("Group algorithms are not supported on host device.",
+                      PI_INVALID_DEVICE);
+#else
+  using LinearId = typename Group::linear_id_type;
+  LinearId linear_id =
+      cl::sycl::detail::Builder::getNDItem<Group::dimensions>()
+          .get_local_linear_id();
+  return linear_id == 0;
+#endif
+}
+
+} // namespace intel
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
+#endif // __DISABLE_SYCL_INTEL_GROUP_ALGORITHMS__
diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp
index c8326eaa0728b..12dfb0eb262f7 100644
--- a/sycl/include/CL/sycl/intel/sub_group.hpp
+++ b/sycl/include/CL/sycl/intel/sub_group.hpp
@@ -14,6 +14,7 @@
 #include <CL/sycl/detail/defines.hpp>
 #include <CL/sycl/detail/generic_type_traits.hpp>
 #include <CL/sycl/detail/helpers.hpp>
+#include <CL/sycl/detail/spirv.hpp>
 #include <CL/sycl/detail/type_traits.hpp>
 #include <CL/sycl/id.hpp>
 #include <CL/sycl/intel/functional.hpp>
@@ -33,12 +34,6 @@ namespace detail {
 
 namespace sub_group {
 
-template <typename T> T broadcast(T x, id<1> local_id) {
-  using OCLT = detail::ConvertToOpenCLType_t<T>;
-  return __spirv_GroupBroadcast(__spv::Scope::Subgroup, OCLT(x),
-                                local_id.get(0));
-}
-
 #define __SYCL_SG_GENERATE_BODY_1ARG(name, SPIRVOperation)                     \
   template <typename T> T name(T x, id<1> local_id) {                          \
     using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
@@ -130,52 +125,6 @@ void store(multi_ptr<T, Space> dst, const vec<T, N> &x) {
                                   bit_cast<VecT>(x));
 }
 
-struct GroupOpISigned {}; struct GroupOpIUnsigned {}; struct GroupOpFP {};
-
-template <typename T, typename = void> struct GroupOpTag;
-
-template <typename T>
-struct GroupOpTag<T, detail::enable_if_t<detail::is_sigeninteger<T>::value>> {
-  using type = GroupOpISigned;
-};
-
-template <typename T>
-struct GroupOpTag<T, detail::enable_if_t<detail::is_sugeninteger<T>::value>> {
-  using type = GroupOpIUnsigned;
-};
-
-template <typename T>
-struct GroupOpTag<T, detail::enable_if_t<detail::is_sgenfloat<T>::value>> {
-  using type = GroupOpFP;
-};
-
-#define __SYCL_SG_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)     \
-  template <typename T, __spv::GroupOperation O>                               \
-  static T calc(GroupTag, T x, BinaryOperation op) {                           \
-    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
-    OCLT Arg = x;                                                              \
-    OCLT Ret = __spirv_Group##SPIRVOperation(__spv::Scope::Subgroup, O, Arg);  \
-    return Ret;                                                                \
-  }
-
-__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, IAdd, intel::plus<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, IAdd, intel::plus<T>)
-__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FAdd, intel::plus<T>)
-
-#undef __SYCL_SG_CALC_OVERLOAD
-
-template <typename T, __spv::GroupOperation O,
-          template <typename> class BinaryOperation>
-static T calc(typename GroupOpTag<T>::type, T x, BinaryOperation<void>) {
-  return calc<T, O>(typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
-}
-
 } // namespace sub_group
 
 } // namespace detail
@@ -213,20 +162,21 @@ struct sub_group {
   }
 
   template <typename T>
-  using EnableIfIsScalarArithmetic = detail::enable_if_t<
-    !detail::is_vec<T>::value && detail::is_arithmetic<T>::value, T>;
+  using EnableIfIsScalarArithmetic =
+      detail::enable_if_t<detail::is_scalar_arithmetic<T>::value, T>;
 
   /* --- collectives --- */
 
   template <typename T>
   EnableIfIsScalarArithmetic<T> broadcast(T x, id<1> local_id) const {
-    return detail::sub_group::broadcast(x, local_id);
+    return detail::spirv::GroupBroadcast<__spv::Scope::Subgroup>(x, local_id);
   }
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
-    return detail::sub_group::calc<T, __spv::GroupOperation::Reduce>(
-        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
+    return detail::calc<T, __spv::GroupOperation::Reduce,
+                        __spv::Scope::Subgroup>(
+        typename detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -236,8 +186,9 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
-    return detail::sub_group::calc<T, __spv::GroupOperation::ExclusiveScan>(
-        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
+    return detail::calc<T, __spv::GroupOperation::ExclusiveScan,
+                        __spv::Scope::Subgroup>(
+        typename detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -255,13 +206,14 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
-    return detail::sub_group::calc<T, __spv::GroupOperation::InclusiveScan>(
-        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
+    return detail::calc<T, __spv::GroupOperation::InclusiveScan,
+                        __spv::Scope::Subgroup>(
+        typename detail::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op,
-                                         T init) const {
+                                               T init) const {
     if (get_local_id().get(0) == 0) {
       x = op(init, x);
     }
@@ -271,8 +223,7 @@ struct sub_group {
   /* --- one-input shuffles --- */
   /* indices in [0 , sub_group size) */
 
-  template <typename T>
-  T shuffle(T x, id<1> local_id) const {
+  template <typename T> T shuffle(T x, id<1> local_id) const {
     return detail::sub_group::shuffle(x, local_id);
   }
 
@@ -280,21 +231,18 @@ struct sub_group {
     return detail::sub_group::shuffle_down(x, x, delta);
   }
 
-  template <typename T>
-  T shuffle_up(T x, uint32_t delta) const {
+  template <typename T> T shuffle_up(T x, uint32_t delta) const {
     return detail::sub_group::shuffle_up(x, x, delta);
   }
 
-  template <typename T>
-  T shuffle_xor(T x, id<1> value) const {
+  template <typename T> T shuffle_xor(T x, id<1> value) const {
     return detail::sub_group::shuffle_xor(x, value);
   }
 
   /* --- two-input shuffles --- */
   /* indices in [0 , 2 * sub_group size) */
 
-  template <typename T>
-  T shuffle(T x, T y, id<1> local_id) const {
+  template <typename T> T shuffle(T x, T y, id<1> local_id) const {
     return detail::sub_group::shuffle_down(x, y,
                                            (local_id - get_local_id()).get(0));
   }
diff --git a/sycl/test/group-algorithm/all_of.cpp b/sycl/test/group-algorithm/all_of.cpp
new file mode 100644
index 0000000000000..a8b4fc4bfff2b
--- /dev/null
+++ b/sycl/test/group-algorithm/all_of.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class all_of_kernel;
+
+// Predicate: true when the value is greater than or equal to zero.
+struct GeZero {
+  bool operator()(int value) const { return !(value < 0); }
+};
+// Predicate: true when the value is divisible by two.
+struct IsEven {
+  bool operator()(int value) const { return value % 2 == 0; }
+};
+// Predicate: true when the value is strictly negative.
+struct LtZero {
+  bool operator()(int value) const { return 0 > value; }
+};
+
+// Runs the three all_of overloads on a single work-group of G work-items and
+// compares each result with std::all_of evaluated on the host over the whole
+// input.
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  using kernel_name = all_of_kernel<Predicate>;
+  size_t N = input.size();
+  size_t G = 16;
+  {
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.get_access<access::mode::read>(cgh);
+      auto out = out_buf.get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        int local_id = item.get_local_id(0);
+        // out[0]: overload taking an already evaluated bool,
+        // out[1]: overload taking a value and a predicate,
+        // out[2]: overload taking a pointer range and a predicate.
+        out[0] = all_of(work_group, pred(in[local_id]));
+        out[1] = all_of(work_group, in[local_id], pred);
+        out[2] = all_of(work_group, in.get_pointer(), in.get_pointer() + N,
+                        pred);
+      });
+    });
+  }
+  bool expected = std::all_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+// Test driver: skips when the device version string compares less than "2.0",
+// otherwise runs the all_of checks with three predicates over the inputs
+// 0..N-1.
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (version < std::string("2.0")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 32;
+  std::array<bool, 3> output;
+  std::fill(output.begin(), output.end(), false);
+  std::array<int, N> input;
+  std::iota(input.begin(), input.end(), 0);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/any_of.cpp b/sycl/test/group-algorithm/any_of.cpp
new file mode 100644
index 0000000000000..4e5391b5b01be
--- /dev/null
+++ b/sycl/test/group-algorithm/any_of.cpp
@@ -0,0 +1,76 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class any_of_kernel;
+
+// Predicate: true when the value is greater than or equal to zero.
+struct GeZero {
+  bool operator()(int value) const { return !(value < 0); }
+};
+// Predicate: true when the value is divisible by two.
+struct IsEven {
+  bool operator()(int value) const { return value % 2 == 0; }
+};
+// Predicate: true when the value is strictly negative.
+struct LtZero {
+  bool operator()(int value) const { return 0 > value; }
+};
+
+// Runs the three any_of overloads on a single work-group of G work-items and
+// compares each result with std::any_of evaluated on the host over the whole
+// input.
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name = any_of_kernel<Predicate>;
+  size_t N = input.size();
+  size_t G = 16;
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        int local_id = item.get_local_id(0);
+        // out[0]: overload taking an already evaluated bool,
+        // out[1]: overload taking a value and a predicate,
+        // out[2]: overload taking a pointer range and a predicate.
+        out[0] = any_of(work_group, pred(in[local_id]));
+        out[1] = any_of(work_group, in[local_id], pred);
+        out[2] = any_of(work_group, in.get_pointer(), in.get_pointer() + N,
+                        pred);
+      });
+    });
+  }
+  bool expected = std::any_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+// Test driver: skips when the device version string compares less than "2.0",
+// otherwise runs the any_of checks with three predicates over the inputs
+// 0..N-1.
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (version < std::string("2.0")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 32;
+  std::array<bool, 3> output;
+  std::fill(output.begin(), output.end(), false);
+  std::array<int, N> input;
+  std::iota(input.begin(), input.end(), 0);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/broadcast.cpp b/sycl/test/group-algorithm/broadcast.cpp
new file mode 100644
index 0000000000000..9fcce3b938673
--- /dev/null
+++ b/sycl/test/group-algorithm/broadcast.cpp
@@ -0,0 +1,62 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+class broadcast_kernel;
+
+// Runs the three broadcast overloads on a single G x G work-group and checks
+// the value every form delivers:
+//   out[0]: no id given        -> expected to equal input[0],
+//   out[1]: 2-D id (1, 2)      -> expected to equal input[1 * G + 2],
+//   out[2]: linear id 2 * G + 1 -> expected to equal input[2 * G + 1].
+// Each work-item contributes the input element at its local linear id.
+template <typename InputContainer, typename OutputContainer>
+void test(queue q, InputContainer input, OutputContainer output) {
+  typedef typename InputContainer::value_type InputT;
+  typedef typename OutputContainer::value_type OutputT;
+  typedef class broadcast_kernel kernel_name;
+  size_t G = 4;
+  range<2> R(G, G);
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<2>(R, R), [=](nd_item<2> it) {
+        group<2> g = it.get_group();
+        int lid = it.get_local_linear_id();
+        out[0] = broadcast(g, in[lid]);
+        out[1] = broadcast(g, in[lid], group<2>::id_type(1, 2));
+        out[2] = broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1));
+      });
+    });
+  }
+  assert(output[0] == input[0]);
+  assert(output[1] == input[1 * G + 2]);
+  assert(output[2] == input[2 * G + 1]);
+}
+
+// Test driver: skips when the device version string compares less than "2.0",
+// otherwise runs the broadcast checks over the inputs 1..N.
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (version < std::string("2.0")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 16;
+  std::array<int, N> output;
+  std::fill(output.begin(), output.end(), 0);
+  std::array<int, N> input;
+  std::iota(input.begin(), input.end(), 1);
+
+  test(q, input, output);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/exclusive_scan.cpp b/sycl/test/group-algorithm/exclusive_scan.cpp
new file mode 100644
index 0000000000000..fad4777a7cec1
--- /dev/null
+++ b/sycl/test/group-algorithm/exclusive_scan.cpp
@@ -0,0 +1,144 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation, int TestNumber>
+class exclusive_scan_kernel;
+
+// std::exclusive_scan isn't implemented yet, so use serial implementation
+// instead
+namespace emu {
+// Serial reference for an exclusive scan. For every input position it writes
+// the combination of init with all preceding inputs (the element at that
+// position is excluded). Returns the iterator one past the last value
+// written.
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator exclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, T init,
+                              BinaryOperation binary_op) {
+  T running = init;
+  while (first != last) {
+    *result = running;
+    ++result;
+    running = binary_op(running, *first);
+    ++first;
+  }
+  return result;
+}
+} // namespace emu
+
+// Checks the four group exclusive_scan forms against the serial emu
+// reference, one kernel per form:
+//   0: value scan without init (reference seeded with `identity`),
+//   1: value scan with init,
+//   2: pointer-range scan without init (reference seeded with `identity`),
+//   3: pointer-range scan with init.
+// The value forms are compared over the first G elements only; the range
+// forms are compared over all N elements. Every kernel runs a single
+// work-group of G work-items.
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name0 = exclusive_scan_kernel<BinaryOperation, 0>;
+  using kernel_name1 = exclusive_scan_kernel<BinaryOperation, 1>;
+  using kernel_name2 = exclusive_scan_kernel<BinaryOperation, 2>;
+  using kernel_name3 = exclusive_scan_kernel<BinaryOperation, 3>;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 16;
+  std::vector<OutputT> expected(N);
+
+  // Form 0: value scan without init.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        int local_id = item.get_local_id(0);
+        out[local_id] = exclusive_scan(work_group, in[local_id], binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  // Form 1: value scan with init.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        int local_id = item.get_local_id(0);
+        out[local_id] =
+            exclusive_scan(work_group, in[local_id], init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
+
+  // Form 2: pointer-range scan without init.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        exclusive_scan(work_group, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(),
+                      identity, binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+
+  // Form 3: pointer-range scan with init.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> work_group = item.get_group();
+        exclusive_scan(work_group, in.get_pointer(), in.get_pointer() + N,
+                       out.get_pointer(), init, binary_op);
+      });
+    });
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init,
+                      binary_op);
+  assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
+}
+
+// Test driver: skips when the device version string compares less than "2.0",
+// otherwise runs the exclusive_scan checks for plus, minimum and maximum over
+// the inputs 0..N-1. The transparent functors (plus<> etc.) are exercised
+// only when compiling as C++14 or later.
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (version < std::string("2.0")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 32;
+  std::array<int, N> output;
+  std::fill(output.begin(), output.end(), 0);
+  std::array<int, N> input;
+  std::iota(input.begin(), input.end(), 0);
+
+  // Identity values handed to test() for the forms without an explicit init.
+  const int max_int = std::numeric_limits<int>::max();
+  const int lowest_int = std::numeric_limits<int>::lowest();
+
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), max_int);
+  test(q, input, output, maximum<>(), lowest_int);
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), max_int);
+  test(q, input, output, maximum<int>(), lowest_int);
+
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/inclusive_scan.cpp b/sycl/test/group-algorithm/inclusive_scan.cpp
new file mode 100644
index 0000000000000..54d79f72e5395
--- /dev/null
+++ b/sycl/test/group-algorithm/inclusive_scan.cpp
@@ -0,0 +1,144 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+#include <vector>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation, int TestNumber>
+class inclusive_scan_kernel; // kernel name: one per (operation, sub-test) pair
+
+// std::inclusive_scan isn't implemented yet, so this serial stand-in writes
+// init op x0, (init op x0) op x1, ... and returns the end of the output range.
+namespace emu {
+template <typename InputIterator, typename OutputIterator,
+          class BinaryOperation, typename T>
+OutputIterator inclusive_scan(InputIterator first, InputIterator last,
+                              OutputIterator result, BinaryOperation binary_op,
+                              T init) {
+  T running = init;
+  while (first != last) {
+    running = binary_op(running, *first++);
+    *result++ = running;
+  }
+  return result;
+}
+} // namespace emu
+
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name0 = inclusive_scan_kernel<BinaryOperation, 0>;
+  using kernel_name1 = inclusive_scan_kernel<BinaryOperation, 1>;
+  using kernel_name2 = inclusive_scan_kernel<BinaryOperation, 2>;
+  using kernel_name3 = inclusive_scan_kernel<BinaryOperation, 3>;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 16;
+  std::vector<OutputT> expected(N);
+  // Compares the first `count` outputs with a serial scan seeded by `seed`.
+  auto verify = [&](size_t count, OutputT seed) {
+    emu::inclusive_scan(input.begin(), input.begin() + count,
+                        expected.begin(), binary_op, seed);
+    assert(std::equal(output.begin(), output.begin() + count,
+                      expected.begin()));
+  };
+  // Sub-test 0: one value per work-item, no initial value.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto src = in_buf.template get_access<access::mode::read>(cgh);
+      auto dst = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> grp = item.get_group();
+        int idx = item.get_local_id(0);
+        dst[idx] = inclusive_scan(grp, src[idx], binary_op);
+      });
+    });
+  }
+  verify(G, identity);
+  // Sub-test 1: one value per work-item, seeded with `init`.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto src = in_buf.template get_access<access::mode::read>(cgh);
+      auto dst = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> grp = item.get_group();
+        int idx = item.get_local_id(0);
+        dst[idx] = inclusive_scan(grp, src[idx], binary_op, init);
+      });
+    });
+  }
+  verify(G, init);
+  // Sub-test 2: pointer range over all N inputs, no initial value.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto src = in_buf.template get_access<access::mode::read>(cgh);
+      auto dst = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> grp = item.get_group();
+        inclusive_scan(grp, src.get_pointer(), src.get_pointer() + N,
+                       dst.get_pointer(), binary_op);
+      });
+    });
+  }
+  verify(N, identity);
+  // Sub-test 3: pointer range over all N inputs, seeded with `init`.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto src = in_buf.template get_access<access::mode::read>(cgh);
+      auto dst = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> grp = item.get_group();
+        inclusive_scan(grp, src.get_pointer(), src.get_pointer() + N,
+                       dst.get_pointer(), binary_op, init);
+      });
+    });
+  }
+  verify(N, init);
+}
+
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (std::string("2.0") > version) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // 32 inputs holding 0..31; every run starts from a zeroed output array.
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::array<int, N> output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(0);
+  const int int_max = std::numeric_limits<int>::max();
+  const int int_lowest = std::numeric_limits<int>::lowest();
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), int_max);
+  test(q, input, output, maximum<>(), int_lowest);
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), int_max);
+  test(q, input, output, maximum<int>(), int_lowest);
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/leader.cpp b/sycl/test/group-algorithm/leader.cpp
new file mode 100644
index 0000000000000..3e0bad4706cfc
--- /dev/null
+++ b/sycl/test/group-algorithm/leader.cpp
@@ -0,0 +1,47 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <cassert>
+using namespace sycl;
+using namespace sycl::intel;
+
+class leader_kernel; // names the single kernel submitted by test()
+
+void test(queue q) {
+  using kernel_name = leader_kernel;
+  // One 4x4 work-group: only the work-item for which leader() holds counts.
+  size_t G = 4;
+  range<2> R(G, G);
+  int out = 0;
+  {
+    // The assert below reads `out` only after this buffer's scope has ended.
+    buffer<int> out_buf(&out, 1);
+    q.submit([&](handler &cgh) {
+      auto counter = out_buf.template get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<2>(R, R), [=](nd_item<2> item) {
+        group<2> grp = item.get_group();
+        if (leader(grp)) {
+          counter[0] += 1;
+        }
+      });
+    });
+  }
+  assert(out == 1);
+}
+
+int main() {
+  queue q;
+  // Run only when the device version string does not order before "2.0".
+  const std::string version = q.get_device().get_info<info::device::version>();
+  const bool supported = !(version < std::string("2.0"));
+  if (supported) {
+    test(q);
+    std::cout << "Test passed." << std::endl;
+    return 0;
+  }
+  std::cout << "Skipping test\n";
+}
diff --git a/sycl/test/group-algorithm/none_of.cpp b/sycl/test/group-algorithm/none_of.cpp
new file mode 100644
index 0000000000000..d0ef19b8ed3ea
--- /dev/null
+++ b/sycl/test/group-algorithm/none_of.cpp
@@ -0,0 +1,74 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class Predicate>
+class none_of_kernel; // kernel name, distinct per predicate type
+
+struct GeZero { // holds for every non-negative value
+  bool operator()(int value) const { return !(value < 0); }
+};
+struct IsEven { // holds when the value has no remainder modulo 2
+  bool operator()(int value) const { return !(value % 2); }
+};
+struct LtZero { // holds for strictly negative values
+  bool operator()(int value) const { return 0 > value; }
+};
+
+// Checks each none_of() overload against std::none_of over the same inputs.
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  typedef class none_of_kernel<Predicate> kernel_name;
+  size_t N = input.size();
+  size_t G = 16; // work-group size: work-item `lid` reads only in[lid]
+  {
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.get_access<access::mode::read>(cgh);
+      auto out = out_buf.get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
+        group<1> g = it.get_group();
+        int lid = it.get_local_id(0);
+        out[0] = none_of(g, pred(in[lid]));
+        out[1] = none_of(g, in[lid], pred);
+        out[2] = none_of(g, in.get_pointer(), in.get_pointer() + N, pred);
+      });
+    });
+  }
+  bool expected = std::none_of(input.begin(), input.begin() + G, pred);
+  assert(output[0] == expected); // per-item overloads see the first G inputs
+  assert(output[1] == expected);
+  assert(output[2] == std::none_of(input.begin(), input.end(), pred)); // all N
+}
+
+int main() {
+  queue q;
+  const std::string version = q.get_device().get_info<info::device::version>();
+  if (version.compare("2.0") < 0) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  // Inputs are 0..31; the three output slots hold one result per overload.
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::iota(input.begin(), input.end(), 0);
+  std::array<bool, 3> output;
+  output.fill(false);
+
+  test(q, input, output, GeZero());
+  test(q, input, output, IsEven());
+  test(q, input, output, LtZero());
+  std::cout << "Test passed." << std::endl;
+}
diff --git a/sycl/test/group-algorithm/reduce.cpp b/sycl/test/group-algorithm/reduce.cpp
new file mode 100644
index 0000000000000..988c40f245ff7
--- /dev/null
+++ b/sycl/test/group-algorithm/reduce.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+#include <CL/sycl.hpp>
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <numeric>
+using namespace sycl;
+using namespace sycl::intel;
+
+template <class BinaryOperation>
+class reduce_kernel; // kernel name, distinct per binary operation type
+
+template <typename InputContainer, typename OutputContainer,
+          class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name = reduce_kernel<BinaryOperation>;
+  OutputT init = 42;
+  size_t N = input.size();
+  size_t G = 16;
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    // One work-group of G items fills one output slot per reduce() overload.
+    q.submit([&](handler &cgh) {
+      auto in = in_buf.template get_access<access::mode::read>(cgh);
+      auto out = out_buf.template get_access<access::mode::discard_write>(cgh);
+      cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> item) {
+        group<1> g = item.get_group();
+        int idx = item.get_local_id(0);
+        out[0] = reduce(g, in[idx], binary_op);
+        out[1] = reduce(g, in[idx], init, binary_op);
+        out[2] = reduce(g, in.get_pointer(), in.get_pointer() + N, binary_op);
+        out[3] =
+            reduce(g, in.get_pointer(), in.get_pointer() + N, init, binary_op);
+      });
+    });
+  }
+
+  // Host reference: std::reduce is not implemented yet, so std::accumulate is
+  // used. The per-item overloads only combine the first G inputs.
+  auto head = input.begin();
+  auto tail = input.end();
+  assert(output[0] == std::accumulate(head, head + G, identity, binary_op));
+  assert(output[1] == std::accumulate(head, head + G, init, binary_op));
+  assert(output[2] == std::accumulate(head, tail, identity, binary_op));
+  assert(output[3] == std::accumulate(head, tail, init, binary_op));
+}
+
+int main() {
+  queue q;
+  std::string version = q.get_device().get_info<info::device::version>();
+  if (!(version >= std::string("2.0"))) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 32;
+  std::array<int, N> input;
+  std::array<int, 4> output;
+  std::iota(input.begin(), input.end(), 0);
+  output.fill(0);
+  using limits = std::numeric_limits<int>;
+  // The last argument of each call is passed to test() as `identity`.
+#if __cplusplus >= 201402L
+  test(q, input, output, plus<>(), 0);
+  test(q, input, output, minimum<>(), limits::max());
+  test(q, input, output, maximum<>(), limits::lowest());
+#endif
+  test(q, input, output, plus<int>(), 0);
+  test(q, input, output, minimum<int>(), limits::max());
+  test(q, input, output, maximum<int>(), limits::lowest());
+  std::cout << "Test passed." << std::endl;
+}