diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp
index bde91d54f49ef..3a50940c25a03 100644
--- a/sycl/include/CL/sycl/detail/type_traits.hpp
+++ b/sycl/include/CL/sycl/detail/type_traits.hpp
@@ -157,6 +157,10 @@ template <typename T, int N, template <typename> class S>
 using is_gen_based_on_type_sizeof =
     bool_constant<S<T>::value && (sizeof(vector_element_t<T>) == N)>;
 
+template <typename> struct is_vec : std::false_type {};
+template <typename T, std::size_t N>
+struct is_vec<cl::sycl::vec<T, N>> : std::true_type {};
+
 // is_integral
 template <typename T>
 struct is_integral : std::is_integral<vector_element_t<T>> {};
diff --git a/sycl/include/CL/sycl/intel/sub_group.hpp b/sycl/include/CL/sycl/intel/sub_group.hpp
index 8de208238b71a..e7eeb9d8844d9 100644
--- a/sycl/include/CL/sycl/intel/sub_group.hpp
+++ b/sycl/include/CL/sycl/intel/sub_group.hpp
@@ -8,15 +8,21 @@
 
 #pragma once
 
+#include <CL/__spirv/spirv_ops.hpp>
 #include <CL/__spirv/spirv_vars.hpp>
 #include <CL/sycl/access/access.hpp>
+#include <CL/sycl/detail/generic_type_traits.hpp>
 #include <CL/sycl/detail/helpers.hpp>
 #include <CL/sycl/detail/type_traits.hpp>
 #include <CL/sycl/id.hpp>
 #include <CL/sycl/intel/functional.hpp>
 #include <CL/sycl/range.hpp>
 #include <CL/sycl/types.hpp>
+
+#include <cstring> // std::memcpy
+#include <numeric> // std::bit_cast
 #include <type_traits>
+
 #ifdef __SYCL_DEVICE_ONLY__
 
 __SYCL_INLINE namespace cl {
@@ -25,69 +31,157 @@ template <typename T, access::address_space Space> class multi_ptr;
 
 namespace detail {
 
-template <typename> struct is_vec : std::false_type {};
-template <typename T, std::size_t N>
-struct is_vec<cl::sycl::vec<T, N>> : std::true_type {};
+namespace sub_group {
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<
-    !detail::is_floating_point<T>::value && std::is_signed<T>::value, T>::type
-calc(T x, intel::minimum<T> op) {
-  return __spirv_GroupSMin(__spv::Scope::Subgroup, O, x);
+template <typename T> T broadcast(T x, id<1> local_id) {
+  using OCLT = detail::ConvertToOpenCLType_t<T>;
+  return __spirv_GroupBroadcast(__spv::Scope::Subgroup, OCLT(x),
+                                local_id.get(0));
 }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<
-    !detail::is_floating_point<T>::value && std::is_unsigned<T>::value, T>::type
-calc(T x, intel::minimum<T> op) {
-  return __spirv_GroupUMin(__spv::Scope::Subgroup, O, x);
-}
+#define __SYCL_SG_GENERATE_BODY_1ARG(name, SPIRVOperation)                     \
+  template <typename T> T name(T x, id<1> local_id) {                          \
+    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    return __spirv_##SPIRVOperation(OCLT(x), local_id.get(0));                 \
+  }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<detail::is_floating_point<T>::value, T>::type
-calc(T x, intel::minimum<T> op) {
-  return __spirv_GroupFMin(__spv::Scope::Subgroup, O, x);
-}
+__SYCL_SG_GENERATE_BODY_1ARG(shuffle, SubgroupShuffleINTEL)
+__SYCL_SG_GENERATE_BODY_1ARG(shuffle_xor, SubgroupShuffleXorINTEL)
+
+#undef __SYCL_SG_GENERATE_BODY_1ARG
+
+#define __SYCL_SG_GENERATE_BODY_2ARG(name, SPIRVOperation)                     \
+  template <typename T> T name(T A, T B, uint32_t Delta) {                     \
+    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    return __spirv_##SPIRVOperation(OCLT(A), OCLT(B), Delta);                  \
+  }
+
+__SYCL_SG_GENERATE_BODY_2ARG(shuffle_down, SubgroupShuffleDownINTEL)
+__SYCL_SG_GENERATE_BODY_2ARG(shuffle_up, SubgroupShuffleUpINTEL)
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<
-    !detail::is_floating_point<T>::value && std::is_signed<T>::value, T>::type
-calc(T x, intel::maximum<T> op) {
-  return __spirv_GroupSMax(__spv::Scope::Subgroup, O, x);
+#undef __SYCL_SG_GENERATE_BODY_2ARG
+
+// Selects an 8-bit, 16-bit or 32-bit type depending on the size of T. If T
+// doesn't map to any of the mentioned types, then void is selected.
+template <typename T>
+using SelectBlockT =
+    select_apply_cl_scalar_t<T, uint8_t, uint16_t, uint32_t, void>;
+
+template <typename T, access::address_space Space>
+using AcceptableForLoadStore =
+    bool_constant<!std::is_same<void, SelectBlockT<T>>::value &&
+                  Space == access::address_space::global_space>;
+
+// TODO: move this to public cl::sycl::bit_cast as extension?
+template <typename To, typename From> To bit_cast(const From &from) {
+#if __cpp_lib_bit_cast
+  return std::bit_cast<To>(from);
+#else
+  static_assert(sizeof(To) == sizeof(From), "bit_cast: size mismatch");
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif // __has_builtin
+  // Prefer the compiler builtin; otherwise copy the object bytes via memcpy.
+#if __has_builtin(__builtin_bit_cast)
+  return __builtin_bit_cast(To, from);
+#else
+  To to;
+  std::memcpy(&to, &from, sizeof(To));
+  return to;
+#endif // __has_builtin(__builtin_bit_cast)
+#endif // __cpp_lib_bit_cast
 }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<
-    !detail::is_floating_point<T>::value && std::is_unsigned<T>::value, T>::type
-calc(T x, intel::maximum<T> op) {
-  return __spirv_GroupUMax(__spv::Scope::Subgroup, O, x);
+template <typename T, access::address_space Space>
+T load(const multi_ptr<T, Space> src) {
+  using BlockT = SelectBlockT<T>;
+  using PtrT = detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
+  // Call the block-read intrinsic with BlockT standing in for T.
+  BlockT Ret =
+      __spirv_SubgroupBlockReadINTEL<BlockT>(reinterpret_cast<PtrT>(src.get()));
+  // Reinterpret the bits that were read as the caller's element type T.
+  return bit_cast<T>(Ret);
 }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<detail::is_floating_point<T>::value, T>::type
-calc(T x, intel::maximum<T> op) {
-  return __spirv_GroupFMax(__spv::Scope::Subgroup, O, x);
+template <int N, typename T, access::address_space Space>
+vec<T, N> load(const multi_ptr<T, Space> src) {
+  using BlockT = SelectBlockT<T>;
+  using VecT = detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
+  using PtrT = detail::ConvertToOpenCLType_t<const multi_ptr<BlockT, Space>>;
+  // Call the block-read intrinsic using the OpenCL type of vec<BlockT, N>.
+  VecT Ret =
+      __spirv_SubgroupBlockReadINTEL<VecT>(reinterpret_cast<PtrT>(src.get()));
+  // Reinterpret the bits as vec<T, N>::vector_t before returning a vec<T, N>.
+  return bit_cast<typename vec<T, N>::vector_t>(Ret);
 }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<
-    !detail::is_floating_point<T>::value && std::is_integral<T>::value, T>::type
-calc(T x, intel::plus<T> op) {
-  return __spirv_GroupIAdd<T>(__spv::Scope::Subgroup, O, x);
+template <typename T, access::address_space Space>
+void store(multi_ptr<T, Space> dst, const T &x) {
+  using BlockT = SelectBlockT<T>;
+  using PtrT = detail::ConvertToOpenCLType_t<multi_ptr<BlockT, Space>>;
+  // Hand x to the block-write intrinsic, bit-cast to BlockT.
+  __spirv_SubgroupBlockWriteINTEL(reinterpret_cast<PtrT>(dst.get()),
+                                  bit_cast<BlockT>(x));
 }
 
-template <typename T, __spv::GroupOperation O>
-static typename std::enable_if<detail::is_floating_point<T>::value, T>::type
-calc(T x, intel::plus<T> op) {
-  return __spirv_GroupFAdd<T>(__spv::Scope::Subgroup, O, x);
+template <int N, typename T, access::address_space Space>
+void store(multi_ptr<T, Space> dst, const vec<T, N> &x) {
+  using BlockT = SelectBlockT<T>;
+  using VecT = detail::ConvertToOpenCLType_t<vec<BlockT, N>>;
+  using PtrT = detail::ConvertToOpenCLType_t<multi_ptr<BlockT, Space>>;
+  // Non-const destination pointer type, matching the scalar store overload.
+  __spirv_SubgroupBlockWriteINTEL(reinterpret_cast<PtrT>(dst.get()),
+                                  bit_cast<VecT>(x));
 }
 
+struct GroupOpISigned {}; struct GroupOpIUnsigned {}; struct GroupOpFP {};
+// Maps T to one of the tags above; the primary template is left undefined.
+template <typename T, typename = void> struct GroupOpTag;
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sigeninteger<T>::value>> {
+  using type = GroupOpISigned;
+};
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sugeninteger<T>::value>> {
+  using type = GroupOpIUnsigned;
+};
+
+template <typename T>
+struct GroupOpTag<T, detail::enable_if_t<detail::is_sgenfloat<T>::value>> {
+  using type = GroupOpFP;
+};
+
+#define __SYCL_SG_CALC_OVERLOAD(GroupTag, SPIRVOperation, BinaryOperation)     \
+  template <typename T, __spv::GroupOperation O>                               \
+  static T calc(GroupTag, T x, BinaryOperation op) {                           \
+    using OCLT = detail::ConvertToOpenCLType_t<T>;                             \
+    OCLT Arg = x;                                                              \
+    OCLT Ret = __spirv_Group##SPIRVOperation(__spv::Scope::Subgroup, O, Arg);  \
+    return Ret;                                                                \
+  }
+// Instantiate one calc() per (tag, operation); the tag selects the SPIR-V op.
+__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMin, intel::minimum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMin, intel::minimum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMin, intel::minimum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, SMax, intel::maximum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, UMax, intel::maximum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FMax, intel::maximum<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpISigned, IAdd<T>, intel::plus<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpIUnsigned, IAdd<T>, intel::plus<T>)
+__SYCL_SG_CALC_OVERLOAD(GroupOpFP, FAdd<T>, intel::plus<T>)
+
+#undef __SYCL_SG_CALC_OVERLOAD
+
 template <typename T, __spv::GroupOperation O,
           template <typename> class BinaryOperation>
-static T calc(T x, BinaryOperation<void>) {
-  return calc<T, O>(x, BinaryOperation<T>());
+static T calc(typename GroupOpTag<T>::type, T x, BinaryOperation<void>) {
+  return calc<T, O>(typename GroupOpTag<T>::type(), x, BinaryOperation<T>());
 }
 
+} // namespace sub_group
+
 } // namespace detail
 
 namespace intel {
@@ -106,9 +200,7 @@ struct sub_group {
 
   id<1> get_group_id() const { return __spirv_BuiltInSubgroupId; }
 
-  unsigned int get_group_range() const {
-    return __spirv_BuiltInNumSubgroups;
-  }
+  unsigned int get_group_range() const { return __spirv_BuiltInNumSubgroups; }
 
   unsigned int get_uniform_group_range() const {
     return __spirv_BuiltInNumEnqueuedSubgroups;
@@ -124,7 +216,6 @@ struct sub_group {
     return __spirv_GroupAll(__spv::Scope::Subgroup, predicate);
   }
 
-
   template <typename T>
   using EnableIfIsScalarArithmetic = detail::enable_if_t<
     !detail::is_vec<T>::value && detail::is_arithmetic<T>::value, T>;
@@ -132,14 +223,14 @@ struct sub_group {
   /* --- collectives --- */
 
   template <typename T>
-  T broadcast(EnableIfIsScalarArithmetic<T> x, id<1> local_id) const {
-    return __spirv_GroupBroadcast<T>(__spv::Scope::Subgroup, x,
-                                            local_id.get(0));
+  EnableIfIsScalarArithmetic<T> broadcast(T x, id<1> local_id) const {
+    return detail::sub_group::broadcast(x, local_id);
   }
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> reduce(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::Reduce>(x, op);
+    return detail::sub_group::calc<T, __spv::GroupOperation::Reduce>(
+        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -149,12 +240,13 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::ExclusiveScan>(x, op);
+    return detail::sub_group::calc<T, __spv::GroupOperation::ExclusiveScan>(
+        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> exclusive_scan(T x, T init,
-                                         BinaryOperation op) const {
+                                               BinaryOperation op) const {
     if (get_local_id().get(0) == 0) {
       x = op(init, x);
     }
@@ -167,7 +259,8 @@ struct sub_group {
 
   template <typename T, class BinaryOperation>
   EnableIfIsScalarArithmetic<T> inclusive_scan(T x, BinaryOperation op) const {
-    return detail::calc<T, __spv::GroupOperation::InclusiveScan>(x, op);
+    return detail::sub_group::calc<T, __spv::GroupOperation::InclusiveScan>(
+        typename detail::sub_group::GroupOpTag<T>::type(), x, op);
   }
 
   template <typename T, class BinaryOperation>
@@ -179,197 +272,92 @@ struct sub_group {
     return inclusive_scan(x, op);
   }
 
-  /* --- one - input shuffles --- */
-  /* indices in [0 , sub - group size ) */
+  /* --- one-input shuffles --- */
+  /* indices in [0 , sub_group size) */
 
   template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle(T x, id<1> local_id) const {
-    return __spirv_SubgroupShuffleINTEL(x, local_id.get(0));
+  T shuffle(T x, id<1> local_id) const {
+    return detail::sub_group::shuffle(x, local_id);
   }
 
-  template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle(T x, id<1> local_id) const {
-    return __spirv_SubgroupShuffleINTEL((typename T::vector_t)x,
-                                               local_id.get(0));
+  template <typename T> T shuffle_down(T x, uint32_t delta) const {
+    return detail::sub_group::shuffle_down(x, x, delta);
   }
 
   template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle_down(T x, uint32_t delta) const {
-    return shuffle_down(x, x, delta);
+  T shuffle_up(T x, uint32_t delta) const {
+    return detail::sub_group::shuffle_up(x, x, delta);
   }
 
   template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle_down(T x, uint32_t delta) const {
-    return shuffle_down(x, x, delta);
+  T shuffle_xor(T x, id<1> value) const {
+    return detail::sub_group::shuffle_xor(x, value);
   }
 
-  template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle_up(T x, uint32_t delta) const {
-    return shuffle_up(x, x, delta);
-  }
-
-  template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle_up(T x, uint32_t delta) const {
-    return shuffle_up(x, x, delta);
-  }
+  /* --- two-input shuffles --- */
+  /* indices in [0 , 2 * sub_group size) */
 
   template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle_xor(T x, id<1> value) const {
-    return __spirv_SubgroupShuffleXorINTEL(x, (uint32_t)value.get(0));
+  T shuffle(T x, T y, id<1> local_id) const {
+    return detail::sub_group::shuffle_down(x, y,
+                                           (local_id - get_local_id()).get(0));
   }
 
   template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle_xor(T x, id<1> value) const {
-    return __spirv_SubgroupShuffleXorINTEL((typename T::vector_t)x,
-                                                  (uint32_t)value.get(0));
+  T shuffle_down(T current, T next, uint32_t delta) const {
+    return detail::sub_group::shuffle_down(current, next, delta);
   }
 
-  /* --- two - input shuffles --- */
-  /* indices in [0 , 2* sub - group size ) */
   template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle(T x, T y, id<1> local_id) const {
-    return __spirv_SubgroupShuffleDownINTEL(
-        x, y, local_id.get(0) - get_local_id().get(0));
+  T shuffle_up(T previous, T current, uint32_t delta) const {
+    return detail::sub_group::shuffle_up(previous, current, delta);
   }
 
-  template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle(T x, T y, id<1> local_id) const {
-    return __spirv_SubgroupShuffleDownINTEL(
-        (typename T::vector_t)x, (typename T::vector_t)y,
-        local_id.get(0) - get_local_id().get(0));
-  }
-
-  template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle_down(T current, T next, uint32_t delta) const {
-    return __spirv_SubgroupShuffleDownINTEL(current, next, delta);
-  }
-
-  template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle_down(T current, T next, uint32_t delta) const {
-    return __spirv_SubgroupShuffleDownINTEL(
-        (typename T::vector_t)current, (typename T::vector_t)next, delta);
-  }
-
-  template <typename T>
-  EnableIfIsScalarArithmetic<T>
-  shuffle_up(T previous, T current, uint32_t delta) const {
-    return __spirv_SubgroupShuffleUpINTEL(previous, current, delta);
-  }
-
-  template <typename T>
-  typename std::enable_if<detail::is_vec<T>::value, T>::type
-  shuffle_up(T previous, T current, uint32_t delta) const {
-    return __spirv_SubgroupShuffleUpINTEL(
-        (typename T::vector_t)previous, (typename T::vector_t)current, delta);
-  }
-
-  /* --- sub - group load / stores --- */
-  /* these can map to SIMD or block read / write hardware where available */
+  /* --- sub_group load/stores --- */
+  /* these can map to SIMD or block read/write hardware where available */
 
   template <typename T, access::address_space Space>
-  typename std::enable_if<(sizeof(T) == sizeof(uint32_t) ||
-                           sizeof(T) == sizeof(uint16_t) ||
-                           sizeof(T) == sizeof(uint8_t)) &&
-                              Space == access::address_space::global_space,
-                          T>::type
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value, T>
   load(const multi_ptr<T, Space> src) const {
-    T data;
-    if (sizeof(T) == sizeof(uint32_t)) {
-      uint32_t t = __spirv_SubgroupBlockReadINTEL<uint32_t>(
-          (const __attribute__((opencl_global)) uint32_t *)src.get());
-      data = *((T *)(&t));
-    } else if (sizeof(T) == sizeof(uint16_t)) {
-      uint16_t t = __spirv_SubgroupBlockReadINTEL<uint16_t>(
-          (const __attribute__((opencl_global)) uint16_t *)src.get());
-      data = *((T *)(&t));
-    } else {
-      uint8_t t = __spirv_SubgroupBlockReadINTEL<uint8_t>(
-          (const __attribute__((opencl_global)) uint8_t *)src.get());
-      data = *((T *)(&t));
-    }
-    return data;
+    return detail::sub_group::load(src);
   }
 
   template <int N, typename T, access::address_space Space>
-  vec<typename std::enable_if<(sizeof(T) == sizeof(uint32_t) ||
-                               sizeof(T) == sizeof(uint16_t)) &&
-                                  Space == access::address_space::global_space,
-                              T>::type,
-      N>
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N != 1,
+      vec<T, N>>
   load(const multi_ptr<T, Space> src) const {
-    if (N == 1) {
-      return load<T, Space>(src);
-    }
-    if (sizeof(T) == sizeof(uint32_t)) {
-      typedef uint32_t ocl_t __attribute__((ext_vector_type(N)));
-
-      ocl_t t = __spirv_SubgroupBlockReadINTEL<ocl_t>(
-          (const __attribute__((opencl_global)) uint32_t *)src.get());
-      return *((typename vec<T, N>::vector_t *)(&t));
-    }
-    typedef uint16_t ocl_t __attribute__((ext_vector_type(N)));
+    return detail::sub_group::load<N, T>(src);
+  }
 
-    ocl_t t = __spirv_SubgroupBlockReadINTEL<ocl_t>(
-        (const __attribute__((opencl_global)) uint16_t *)src.get());
-    return *((typename vec<T, N>::vector_t *)(&t));
+  template <int N, typename T, access::address_space Space>
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N == 1,
+      vec<T, 1>>
+  load(const multi_ptr<T, Space> src) const {
+    return detail::sub_group::load(src);
   }
 
   template <typename T, access::address_space Space>
-  void
-  store(multi_ptr<T, Space> dst,
-        const typename std::enable_if<
-            (sizeof(T) == sizeof(uint32_t) || sizeof(T) == sizeof(uint16_t) ||
-             sizeof(T) == sizeof(uint8_t)) &&
-                Space == access::address_space::global_space,
-            T>::type &x) const {
-    if (sizeof(T) == sizeof(uint32_t)) {
-      __spirv_SubgroupBlockWriteINTEL<uint32_t>(
-          (__attribute__((opencl_global)) uint32_t *)dst.get(), *((uint32_t *)&x));
-    } else if (sizeof(T) == sizeof(uint16_t)) {
-      __spirv_SubgroupBlockWriteINTEL<uint16_t>(
-          (__attribute__((opencl_global)) uint16_t *)dst.get(), *((uint16_t *)&x));
-    } else {
-      __spirv_SubgroupBlockWriteINTEL<uint8_t>(
-          (__attribute__((opencl_global)) uint8_t *)dst.get(), *((uint8_t *)&x));
-    }
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value>
+  store(multi_ptr<T, Space> dst, const T &x) const {
+    detail::sub_group::store(dst, x);
   }
 
   template <int N, typename T, access::address_space Space>
-  void store(multi_ptr<T, Space> dst,
-             const vec<typename std::enable_if<N == 1, T>::type, N> &x) const {
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N == 1>
+  store(multi_ptr<T, Space> dst, const vec<T, N> &x) const {
     store<T, Space>(dst, x);
   }
 
   template <int N, typename T, access::address_space Space>
-  void store(
-      multi_ptr<T, Space> dst,
-      const vec<typename std::enable_if<
-                    (sizeof(T) == sizeof(uint32_t) ||
-                     sizeof(T) == sizeof(uint16_t)) &&
-                        N != 1 && Space == access::address_space::global_space,
-                    T>::type,
-                N> &x) const {
-    if (sizeof(T) == sizeof(uint32_t)) {
-      typedef uint32_t ocl_t __attribute__((ext_vector_type(N)));
-      __spirv_SubgroupBlockWriteINTEL((__attribute__((opencl_global)) uint32_t *)dst.get(),
-                                             *((ocl_t *)&x));
-    } else {
-      typedef uint16_t ocl_t __attribute__((ext_vector_type(N)));
-      __spirv_SubgroupBlockWriteINTEL((__attribute__((opencl_global)) uint16_t *)dst.get(),
-                                             *((ocl_t *)&x));
-    }
+  detail::enable_if_t<
+      detail::sub_group::AcceptableForLoadStore<T, Space>::value && N != 1>
+  store(multi_ptr<T, Space> dst, const vec<T, N> &x) const {
+    detail::sub_group::store(dst, x);
   }
 
   /* --- synchronization functions --- */